// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

// CHECK-DAG: [[TEAM1_REDUCE_TY:%.+]] = type { [{{1024|2048}} x double] }
// CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] }
// CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] }
// CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] }
// CHECK-DAG: [[MAP_TY:%.+]] = type { [128 x i8] }

// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16

// Check for the data transfer medium in shared memory used to transfer the reduction list to the first warp.
// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]

// Check that the execution mode of the first two target regions is set to Non-SPMD and that of the third to SPMD.
// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 1
// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 1
// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0

// CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer

template<typename tx>
tx ftemplate(int n) {
  int a;
  short b;
  tx c;
  float d;
  double e;

  #pragma omp target
  #pragma omp teams reduction(+: e)
  {
    e += 5;
  }

  #pragma omp target
  #pragma omp teams reduction(^: c) reduction(*: d)
  {
    c ^= 2;
    d *= 33;
  }

  #pragma omp target
  #pragma omp teams reduction(|: a) reduction(max: b)
  #pragma omp parallel reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }

  return a+b+c+d+e;
}

int bar(int n){
  int a = 0;

  a += ftemplate<char>(n);

  return a;
}
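
// A rough host-side sketch of what the three reductions above compute; it is
// illustrative only (plain comments, not part of the checked output), and the
// names match the template above:
//
//   e += 5;                         // '+' over double, combined across teams
//   c ^= 2; d *= 33;                // '^' over char and '*' over float
//   a |= 1; b = max(b, (short)99);  // '|' and 'max', teams and parallel level
//
// The device lowering checked below performs each combiner in three stages:
// within a warp via shuffles, across warps via a shared-memory medium, and
// across teams via a global buffer.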
72
Alexey Bataev8061acd2019-02-20 16:36:22 +000073 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l41}}_worker()
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000074
Alexey Bataev8061acd2019-02-20 16:36:22 +000075 // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l41]](
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000076 //
77 // CHECK: {{call|invoke}} void [[T1]]_worker()
78 //
79 // CHECK: call void @__kmpc_kernel_init(
Alexey Bataev8c5555c2019-05-21 15:11:58 +000080 // CHECK: call void @__kmpc_kernel_deinit(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000081 //
82 // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
83 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
84 // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
85 // CHECK: store double [[ADD]], double* [[E]], align
Alexey Bataev8061acd2019-02-20 16:36:22 +000086 // CHECK: [[GEP1:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
87 // CHECK: [[BC:%.+]] = bitcast double* [[E]] to i8*
88 // CHECK: store i8* [[BC]], i8** [[GEP1]],
89 // CHECK: [[BC_RED_LIST:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8*
Alexey Bataev8c5555c2019-05-21 15:11:58 +000090 // CHECK: [[BUF:%.+]] = load i8*, i8** @
91 // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +000092 // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
93 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
94 //
95 // CHECK: [[IFLABEL]]
96 // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
97 // CHECK: [[EV:%.+]] = load double, double* [[E]], align
98 // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
99 // CHECK: store double [[ADD]], double* [[E_IN]], align
Alexey Bataev8061acd2019-02-20 16:36:22 +0000100 // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000101 // CHECK: br label %[[EXIT]]
102 //
103 // CHECK: [[EXIT]]
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000104
Alexey Bataev8061acd2019-02-20 16:36:22 +0000105 //
106 // Reduction function
107 // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
108 // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
109 // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
110 // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
111 //
112 // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
113 // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
114 // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double*
115 //
116 // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]],
117 // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]],
118 // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]]
119 // CHECK: store double [[RES]], double* [[VAR_LHS]],
120 // CHECK: ret void
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +0000121
Alexey Bataev8061acd2019-02-20 16:36:22 +0000122 //
123 // Shuffle and reduce function
124 // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
125 // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [1 x i8*], align
126 // CHECK: [[REMOTE_ELT:%.+]] = alloca double
127 //
128 // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
129 // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
130 // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
131 //
132 // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
133 // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
134 // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
135 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
136 //
137 // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
138 // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
139 // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
140 // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
141 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
142 // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
143 //
144 // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align
145 // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8*
146 // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align
147 //
148 // Condition to reduce
149 // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
150 //
151 // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
152 // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
153 // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
154 //
155 // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
156 // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
157 // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
158 // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
159 // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
160 // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
161 //
162 // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
163 // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
164 // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
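  //
  // Note: the three guarded cases above appear to correspond to the runtime's
  // shuffle-and-reduce modes (an inference from the checks, not something the
  // test itself asserts): ALGVER == 0 reduces a full warp, ALGVER == 1 reduces
  // a contiguous partial warp (only lanes with id < offset combine), and
  // ALGVER == 2 handles a partial warp with dispersed lanes (even lane ids
  // with a positive offset).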
  //
  // CHECK: [[DO_REDUCE]]
  // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8*
  // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [1 x i8*]* [[REMOTE_RED_LIST]] to i8*
  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
  // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
  //
  // CHECK: [[REDUCE_ELSE]]
  // CHECK: br label {{%?}}[[REDUCE_CONT]]
  //
  // CHECK: [[REDUCE_CONT]]
  // Now check if we should just copy over the remote reduction list
  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
  // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
  // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
  // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
  //
  // CHECK: [[DO_COPY]]
  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double*
  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align
  // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align
  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
  //
  // CHECK: [[COPY_ELSE]]
  // CHECK: br label {{%?}}[[COPY_CONT]]
  //
  // CHECK: [[COPY_CONT]]
  // CHECK: void

  //
  // Inter warp copy function
  // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32)
  // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
  // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
  // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [1 x i8*]*
  // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
  // CHECK: br label
  // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
  // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
  // CHECK: br i1 [[DONE_COPY]], label
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
  //
  // [[DO_COPY]]
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
  //
  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
  //
  // CHECK: [[COPY_ELSE]]
  // CHECK: br label {{%?}}[[COPY_CONT]]
  //
  // Barrier after copy to shared memory storage medium.
  // CHECK: [[COPY_CONT]]
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
  //
  // Read into warp 0.
  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
  //
  // CHECK: [[DO_READ]]
  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
  // CHECK: br label {{%?}}[[READ_CONT:.+]]
  //
  // CHECK: [[READ_ELSE]]
  // CHECK: br label {{%?}}[[READ_CONT]]
  //
  // CHECK: [[READ_CONT]]
  // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
  // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
  // CHECK: br label
  // CHECK: ret
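  //
  // The loop checked above moves one 32-bit chunk per iteration through the
  // shared [32 x i32] medium, so the 64-bit double takes two trips, which is
  // why the counter is compared against 2. A hedged C-like sketch of the
  // pattern (helper names are illustrative, not the runtime's):
  //
  //   for (int cnt = 0; cnt < 2; ++cnt) {  // two 32-bit halves of the double
  //     barrier();
  //     if (lane_id == 0)                  // each warp master publishes a chunk
  //       medium[warp_id] = ((int *)elt)[cnt];
  //     barrier();
  //     if (tid < active_warps)            // warp 0 gathers the chunks
  //       ((int *)elt)[cnt] = medium[tid];
  //   }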

  // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]*
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
  // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double*
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[LOC_RED1:%.+]] = load double, double* [[RL_RED1]],
  // CHECK: store double [[LOC_RED1]], double* [[GLOBAL_RED1_IDX_PTR]],
  // CHECK: ret void

  // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*],
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8*
  // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
  // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8*
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
  // CHECK: ret void

  // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]*
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
  // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double*
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1:%.+]] = load double, double* [[GLOBAL_RED1_IDX_PTR]],
  // CHECK: store double [[GLOBAL_RED1]], double* [[RL_RED1]],
  // CHECK: ret void

  // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*],
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8*
  // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
  // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8*
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
  // CHECK: ret void
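  //
  // Taken together, the four helpers above appear to service the team-level
  // reduction through [[TEAMS_RED_BUFFER]]: each participating team uses one
  // slot of the global record (1024 slots by default, 2048 under
  // -fopenmp-cuda-teams-reduction-recs-num=2048), the *_TO_GLOBAL_* pair
  // publishes or combines a team's partial result into its slot, and the
  // GLOBAL_TO_* pair copies or reduces the slots back into a local reduction
  // list for the final pass.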

  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}_worker()

  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l47]](
  //
  // CHECK: {{call|invoke}} void [[T2]]_worker()

  //
  // CHECK: call void @__kmpc_kernel_init(
  // CHECK: call void @__kmpc_kernel_deinit(
  //
  // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align
  // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align
  // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32
  // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2
  // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
  // CHECK: store i8 [[TRUNC]], i8* [[C]], align
  // CHECK: [[DV:%.+]] = load float, float* [[D]], align
  // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}}
  // CHECK: store float [[MUL]], float* [[D]], align
  // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: store i8* [[C]], i8** [[GEP1]],
  // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[BC:%.+]] = bitcast float* [[D]] to i8*
  // CHECK: store i8* [[BC]], i8** [[GEP2]],
  // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
  // CHECK: [[BUF:%.+]] = load i8*, i8** @
  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
  // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
  // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
  //
  // CHECK: [[IFLABEL]]
  // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
  // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
  // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
  // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32
  // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]]
  // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8
  // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align
  // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align
  // CHECK: [[DV:%.+]] = load float, float* [[D]], align
  // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
  // CHECK: store float [[MUL]], float* [[D_IN]], align
  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
  // CHECK: br label %[[EXIT]]
  //
  // CHECK: [[EXIT]]

  //
  // Reduction function
  // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
  // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
  //
  // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
  //
  // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
  // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float*
  //
  // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
  // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float*
  //
  // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]],
  // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32
  // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]],
  // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32
  // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
  // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8
  // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]],
  //
  // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]],
  // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]],
  // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
  // CHECK: store float [[RES]], float* [[VAR2_LHS]],
  // CHECK: ret void

  //
  // Shuffle and reduce function
  // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
  // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align
  // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8
  // CHECK: [[REMOTE_ELT2:%.+]] = alloca float
  //
  // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
  // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
  // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
  //
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
  //
  // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
  // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
  // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
  //
  // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align
  // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align
  //
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
  //
  // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
  // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
  // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
  //
  // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align
  // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8*
  // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
  //
  // Condition to reduce
  // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
  //
  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
  // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
  // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
  //
  // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
  // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
  // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
  // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
  // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
  // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
  //
  // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
  // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
  // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
  //
  // CHECK: [[DO_REDUCE]]
  // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
  // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8*
  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
  // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
  //
  // CHECK: [[REDUCE_ELSE]]
  // CHECK: br label {{%?}}[[REDUCE_CONT]]
  //
  // CHECK: [[REDUCE_CONT]]
  // Now check if we should just copy over the remote reduction list
  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
  // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
  // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
  // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
  //
  // CHECK: [[DO_COPY]]
  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align
  // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align
  //
  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float*
  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align
  // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align
  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
  //
  // CHECK: [[COPY_ELSE]]
  // CHECK: br label {{%?}}[[COPY_CONT]]
  //
  // CHECK: [[COPY_CONT]]
  // CHECK: void

  //
  // Inter warp copy function
  // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32)
  // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
  // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
  // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [2 x i8*]*
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
  //
  // [[DO_COPY]]
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  //
  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
  // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
  // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
  //
  // CHECK: [[COPY_ELSE]]
  // CHECK: br label {{%?}}[[COPY_CONT]]
  //
  // Barrier after copy to shared memory storage medium.
  // CHECK: [[COPY_CONT]]
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
  //
  // Read into warp 0.
  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
  //
  // CHECK: [[DO_READ]]
  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
  // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
  // CHECK: br label {{%?}}[[READ_CONT:.+]]
  //
  // CHECK: [[READ_ELSE]]
  // CHECK: br label {{%?}}[[READ_CONT]]
  //
  // CHECK: [[READ_CONT]]
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
  //
  // [[DO_COPY]]
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
  //
  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
  //
  // CHECK: [[COPY_ELSE]]
  // CHECK: br label {{%?}}[[COPY_CONT]]
  //
  // Barrier after copy to shared memory storage medium.
  // CHECK: [[COPY_CONT]]
  // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
  // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
  //
  // Read into warp 0.
  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
  //
  // CHECK: [[DO_READ]]
  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
  // CHECK: br label {{%?}}[[READ_CONT:.+]]
  //
  // CHECK: [[READ_ELSE]]
  // CHECK: br label {{%?}}[[READ_CONT]]
  //
  // CHECK: [[READ_CONT]]
  // CHECK: ret

  // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[LOC_RED1:%.+]] = load i8, i8* [[RL_RED1]],
  // CHECK: store i8 [[LOC_RED1]], i8* [[GLOBAL_RED1_IDX_PTR]],
  // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
  // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float*
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[LOC_RED1:%.+]] = load float, float* [[RL_RED1]],
  // CHECK: store float [[LOC_RED1]], float* [[GLOBAL_RED1_IDX_PTR]],
  // CHECK: ret void

  // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]]
  // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8*
  // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
  // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
  // CHECK: ret void

  // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1:%.+]] = load i8, i8* [[GLOBAL_RED1_IDX_PTR]],
  // CHECK: store i8 [[GLOBAL_RED1]], i8* [[RL_RED1]],
  // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
  // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float*
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1:%.+]] = load float, float* [[GLOBAL_RED1_IDX_PTR]],
  // CHECK: store float [[GLOBAL_RED1]], float* [[RL_RED1]],
  // CHECK: ret void

  // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*)
  // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
  // CHECK: [[IDX_PTR:%.+]] = alloca i32,
  // CHECK: [[RL_PTR:%.+]] = alloca i8*,
  // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
  // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
  // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
  // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
  // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
  // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]*
  // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
  // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]]
  // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
  // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8*
  // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
  // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
  // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
  // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
  // CHECK: ret void

  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l54}}(
  //
  // CHECK: call void @__kmpc_spmd_kernel_init(
  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
  // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)

  // CHECK-NOT: call void @__kmpc_get_team_static_memory
  // CHECK: store i32 0,
  // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align
  // CHECK: store i16 -32768, i16* [[B_ADDR:%.+]], align
  // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
  // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
  // CHECK: [[BC:%.+]] = bitcast i32* [[A_ADDR]] to i8*
  // CHECK: store i8* [[BC]], i8** [[GEP1]],
  // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
  // CHECK: [[BC:%.+]] = bitcast i16* [[B_ADDR]] to i8*
  // CHECK: store i8* [[BC]], i8** [[GEP2]],
  // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
  // CHECK: [[BUF:%.+]] = load i8*, i8** @
  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* [[BUF]], i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]])
  // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
  // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
  //
  // CHECK: [[IFLABEL]]
  // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
  // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
  // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
  // CHECK: store i32 [[OR]], i32* [[A_IN]], align
  // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
  // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
  // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
  // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
  //
  // CHECK: [[DO_MAX]]
  // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
  //
  // CHECK: [[MAX_ELSE]]
  // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
  // CHECK: br label {{%?}}[[MAX_CONT]]
  //
  // CHECK: [[MAX_CONT]]
  // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
  // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]])
  // CHECK: br label %[[EXIT]]
  //
  // CHECK: [[EXIT]]

  // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
  //
  // CHECK: store i32 0, i32* [[A:%.+]], align
  // CHECK: store i16 -32768, i16* [[B:%.+]], align
  // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align
  // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1
  // CHECK: store i32 [[OR]], i32* [[A]], align
  // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
  // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
  // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]]
  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
  //
  // CHECK: [[DO_MAX]]
  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
  //
  // CHECK: [[MAX_ELSE]]
  // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align
  // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32
  // CHECK: br label {{%?}}[[MAX_CONT]]
  //
  // CHECK: [[MAX_CONT]]
  // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ]
  // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16
  // CHECK: store i16 [[TRUNC]], i16* [[B]], align
  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{.+}} 0, i[[SZ:.+]] 0
  // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8*
  // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
  // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
  // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
  // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* [[LOC]], i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
  // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
  // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
  //
  // CHECK: [[IFLABEL]]
  // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
  // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
  // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
  // CHECK: store i32 [[OR]], i32* [[A_IN]], align
  // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
  // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
  // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align
  // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
  //
  // CHECK: [[DO_MAX]]
  // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
  //
  // CHECK: [[MAX_ELSE]]
  // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align
  // CHECK: br label {{%?}}[[MAX_CONT]]
  //
  // CHECK: [[MAX_CONT]]
  // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
  // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
  // CHECK: br label %[[EXIT]]
  //
  // CHECK: [[EXIT]]
  // CHECK: ret void
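  //
  // The SPMD region reduces in two stages: the outlined parallel body above
  // first combines the thread-private copies with
  // __kmpc_nvptx_parallel_reduce_nowait_v2, and the combined per-team result
  // is then fed into the teams-level __kmpc_nvptx_teams_reduce_nowait_v2 call
  // checked earlier.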
821
822 //
823 // Reduction function
824 // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
825 // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
826 // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
827 // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
828 //
829 // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
830 // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
831 // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
832 //
833 // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
834 // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
835 // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
836 //
837 // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
838 // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
839 // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
840 //
841 // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
842 // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
843 // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
844 // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
845 //
846 // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
847 // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
848 // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
849 // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
850 //
851 // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
852 // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
853 //
854 // CHECK: [[DO_MAX]]
855 // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
856 // CHECK: br label {{%?}}[[MAX_CONT:.+]]
857 //
858 // CHECK: [[MAX_ELSE]]
859 // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
860 // CHECK: br label {{%?}}[[MAX_CONT]]
861 //
862 // CHECK: [[MAX_CONT]]
// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
// CHECK: ret void
//
// Shuffle and reduce function
// CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
//
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
//
// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
//
// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: void
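//
// For reference, a hedged C++ sketch of what the shuffle-and-reduce function
// verified above does (illustrative only, not part of the test; every name
// except __kmpc_shuffle_int32 is hypothetical). Each element is widened to
// 32 bits, exchanged with a remote lane, and conditionally combined:
//
//   static void par_shuffle_reduce(void **rl, short lane_id,
//                                  short lane_offset, short alg_ver) {
//     // Exchange both elements with the lane `lane_offset` away; the i16
//     // element is sign-extended for the 32-bit shuffle and truncated back.
//     int remote0 = __kmpc_shuffle_int32(*(int *)rl[0], lane_offset, /*warpsize*/ 32);
//     short remote1 = (short)__kmpc_shuffle_int32((int)*(short *)rl[1],
//                                                 lane_offset, /*warpsize*/ 32);
//     void *remote_rl[2] = {&remote0, &remote1};
//     // Reduce for: alg 0 (full warp), alg 1 in the lower half of a
//     // contiguous partial warp, alg 2 on even lanes with a positive offset.
//     bool should_reduce = alg_ver == 0 ||
//                          (alg_ver == 1 && lane_id < lane_offset) ||
//                          (alg_ver == 2 && (lane_id & 1) == 0 && lane_offset > 0);
//     if (should_reduce)
//       par_reduction_func((void *)rl, (void *)remote_rl); // local (+)= remote
//     // Algorithm 1 upper-half lanes instead adopt the remote values.
//     if (alg_ver == 1 && lane_id >= lane_offset) {
//       *(int *)rl[0] = remote0;
//       *(short *)rl[1] = remote1;
//     }
//   }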

//
// Inter warp copy function
// CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: ret
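//
// For reference, a hedged sketch of the inter-warp copy above (illustrative
// only; apart from the 32-slot shared transfer medium and the barriers, all
// names are hypothetical). Each warp master publishes one 32-bit slot per
// reduction element, a barrier separates writes from reads, and the first
// num_active_warps threads of warp 0 collect the published values; the i16
// element goes through a bitcast of its slot in the generated IR:
//
//   // __shared__ int transfer_medium[32];   // one slot per warp
//   static void par_warp_copy(void **rl, int num_active_warps) {
//     int tid = thread_id(), lane_id = tid & 31, warp_id = tid >> 5;
//     for (int elt = 0; elt < 2; ++elt) {     // i32 element, then i16
//       if (lane_id == 0)                     // warp master writes
//         transfer_medium[warp_id] = *(int *)rl[elt];
//       barrier();                            // __kmpc_barrier in the IR
//       if (tid < num_active_warps)           // warp 0 reads back
//         *(int *)rl[elt] = transfer_medium[tid];
//       barrier();
//     }
//   }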

//
// Reduction function
// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
//
// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
//
// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
//
// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
//
// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
//
// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
//
// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
// CHECK: [[DO_MAX]]
// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
// CHECK: br label {{%?}}[[MAX_CONT:.+]]
//
// CHECK: [[MAX_ELSE]]
// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
// CHECK: br label {{%?}}[[MAX_CONT]]
//
// CHECK: [[MAX_CONT]]
// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
// CHECK: ret void
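//
// For reference, a hedged sketch of the reduction function above
// (illustrative only; names are hypothetical): the RHS list is folded into
// the LHS list element-wise, with a bitwise 'or' on the i32 element and a
// signed 'max' on the i16 element, matching the or/sext/icmp sgt pattern:
//
//   static void reduction_func(void **lhs, void **rhs) {
//     *(int *)lhs[0] |= *(int *)rhs[0];           // 'or' combiner
//     short l = *(short *)lhs[1], r = *(short *)rhs[1];
//     *(short *)lhs[1] = l > r ? l : r;           // signed 'max' combiner
//   }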

//
// Shuffle and reduce function
// CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align
// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
//
// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
//
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
//
// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
//
// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
//
// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
//
// Condition to reduce
// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
//
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
//
// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
//
// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
//
// CHECK: [[DO_REDUCE]]
// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8*
// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8*
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
// CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
//
// CHECK: [[REDUCE_ELSE]]
// CHECK: br label {{%?}}[[REDUCE_CONT]]
//
// CHECK: [[REDUCE_CONT]]
// Now check if we should just copy over the remote reduction list
// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
//
// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// CHECK: [[COPY_CONT]]
// CHECK: void

//
// Inter warp copy function
// CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32)
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// CHECK: [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br label {{%?}}[[COPY_CONT]]
//
// Barrier after copy to shared memory storage medium.
// CHECK: [[COPY_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32*
//
// Read into warp 0.
// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: br label {{%?}}[[READ_CONT]]
//
// CHECK: [[READ_CONT]]
// CHECK: ret

// CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*)
// CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
// CHECK: [[IDX_PTR:%.+]] = alloca i32,
// CHECK: [[RL_PTR:%.+]] = alloca i8*,
// CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
// CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
// CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
// CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
// CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
// CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
// CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
// CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
// CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
// CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32*
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[LOC_RED1:%.+]] = load i32, i32* [[RL_RED1]],
// CHECK: store i32 [[LOC_RED1]], i32* [[GLOBAL_RED1_IDX_PTR]],
// CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
// CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16*
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[LOC_RED1:%.+]] = load i16, i16* [[RL_RED1]],
// CHECK: store i16 [[LOC_RED1]], i16* [[GLOBAL_RED1_IDX_PTR]],
// CHECK: ret void
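//
// For reference, a hedged sketch of the list-to-global copy above
// (illustrative only; team3_reduce_ty and kNumRecs are hypothetical
// stand-ins for [[TEAM3_REDUCE_TY]] and its 1024/2048 record count): each
// private element is stored into this team's slot of the global buffer:
//
//   // struct team3_reduce_ty { int a[kNumRecs]; short b[kNumRecs]; };
//   static void list_to_global_copy(void *gbl, int idx, void **rl) {
//     team3_reduce_ty *g = (team3_reduce_ty *)gbl;
//     g->a[idx] = *(int *)rl[0];
//     g->b[idx] = *(short *)rl[1];
//   }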

// CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*)
// CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
// CHECK: [[IDX_PTR:%.+]] = alloca i32,
// CHECK: [[RL_PTR:%.+]] = alloca i8*,
// CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
// CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
// CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
// CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
// CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
// CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
// CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
// CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8*
// CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
// CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8*
// CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
// CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
// CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
// CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])
// CHECK: ret void
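//
// For reference, a hedged sketch of the list-to-global reduce above
// (illustrative only; names as in the previous sketch): it builds a
// transient reduction list whose entries point into this team's row of the
// global buffer and reuses the reduction function, so the row accumulates
// the incoming list:
//
//   static void list_to_global_reduce(void *gbl, int idx, void **rl) {
//     team3_reduce_ty *g = (team3_reduce_ty *)gbl;
//     void *local_rl[2] = {&g->a[idx], &g->b[idx]};
//     reduction_func(local_rl, rl);    // global row (+)= incoming list
//   }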

// CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*)
// CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
// CHECK: [[IDX_PTR:%.+]] = alloca i32,
// CHECK: [[RL_PTR:%.+]] = alloca i8*,
// CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
// CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
// CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
// CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
// CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]*
// CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
// CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
// CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
// CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
// CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32*
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[GLOBAL_RED1:%.+]] = load i32, i32* [[GLOBAL_RED1_IDX_PTR]],
// CHECK: store i32 [[GLOBAL_RED1]], i32* [[RL_RED1]],
// CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]],
// CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16*
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[GLOBAL_RED1:%.+]] = load i16, i16* [[GLOBAL_RED1_IDX_PTR]],
// CHECK: store i16 [[GLOBAL_RED1]], i16* [[RL_RED1]],
// CHECK: ret void

// CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*)
// CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*,
// CHECK: [[IDX_PTR:%.+]] = alloca i32,
// CHECK: [[RL_PTR:%.+]] = alloca i8*,
// CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*],
// CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]],
// CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]],
// CHECK: store i8* %{{.+}}, i8** [[RL_PTR]],
// CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]],
// CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]*
// CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]],
// CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8*
// CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
// CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
// CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]]
// CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8*
// CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]]
// CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8*
// CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]],
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
// CHECK: ret void
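//
// For reference, a hedged sketch of the two global-to-list mirrors above
// (illustrative only; names as before). They move data the opposite way,
// and the reduce variant swaps the argument order of the final call, so the
// private list accumulates the global row:
//
//   static void global_to_list_copy(void *gbl, int idx, void **rl) {
//     team3_reduce_ty *g = (team3_reduce_ty *)gbl;
//     *(int *)rl[0] = g->a[idx];
//     *(short *)rl[1] = g->b[idx];
//   }
//   static void global_to_list_reduce(void *gbl, int idx, void **rl) {
//     team3_reduce_ty *g = (team3_reduce_ty *)gbl;
//     void *local_rl[2] = {&g->a[idx], &g->b[idx]};
//     reduction_func(rl, local_rl);    // private list (+)= global row
//   }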

#endif