; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Add extra verifier runs. There were some cases where invalid IR
; was produced but happened to be fixed by later passes.

; Make sure divergent control flow with multiple exits from a region
; is properly handled. UnifyFunctionExitNodes should be run before
; StructurizeCFG.
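; Unifying the exits first gives StructurizeCFG a single-exit region to work
; with: the returns are merged into a UnifiedReturnBlock (and unreachables
; into a UnifiedUnreachableBlock), which is what the IR checks below expect.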

; IR-LABEL: @multi_divergent_region_exit_ret_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: %2 = extractvalue { i1, i64 } %1, 0
; IR: %3 = extractvalue { i1, i64 } %1, 1
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: %7 = extractvalue { i1, i64 } %6, 0
; IR: %8 = extractvalue { i1, i64 } %6, 1
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: LeafBlock:
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: br label %Flow{{$}}

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: %13 = extractvalue { i1, i64 } %12, 0
; IR: %14 = extractvalue { i1, i64 } %12, 1
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void


; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
; GCN: v_cmp_lt_i32_e32 vcc, 1
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64


; FIXME: Why is this compare essentially repeated?
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1

; GCN: ; %Flow1
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0

; GCN: ; %exit1
; GCN: ds_write_b32

; GCN: %Flow2
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN-NEXT: s_and_saveexec_b64
; GCN-NEXT: s_xor_b64

; GCN: ; %exit0
; GCN: buffer_store_dword

; GCN: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock


; IR: UnifiedUnreachableBlock:
; IR-NEXT: unreachable


; FIXME: Probably should insert an s_endpgm anyway.
; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
; GCN: ; %UnifiedUnreachableBlock
; GCN-NEXT: .Lfunc_end
define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  unreachable
}

; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
; IR: llvm.amdgcn.if
; IR: br i1

; IR: {{^}}Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
; IR: br i1 %7, label %LeafBlock, label %Flow1

; IR: {{^}}LeafBlock:
; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
; IR: %9 = xor i1 %divergent.cond1, true
; IR: br label %Flow1

; IR: LeafBlock1:
; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
; IR: %10 = xor i1 %uniform.cond0, true
; IR: br label %Flow

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: br label %UnifiedReturnBlock

; IR: {{^}}Flow1:
; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR: store volatile i32 17, i32 addrspace(3)* undef
; IR: br label %Flow2

; IR: UnifiedReturnBlock:
; IR: call void @llvm.amdgcn.end.cf(i64 %14)
; IR: ret void
define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %divergent.cond0 = icmp slt i32 %tmp16, 2
  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %tmp16, 1
  br i1 %divergent.cond1, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %uniform.cond0 = icmp eq i32 %arg3, 2
  br i1 %uniform.cond0, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
; IR: br i1 %2, label %LeafBlock1, label %Flow

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)

define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %arg3, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void
}

; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
; IR: Flow2:
; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %20)

; IR: UnifiedReturnBlock:
; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %15)
; IR: ret float %UnifiedRetVal
define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
entry:
  %Pivot = icmp slt i32 %vgpr, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %vgpr, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(

; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
; GCN: s_cmp_gt_i32 s0, 1
; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]

; GCN: v_cmp_ne_u32_e32 vcc, 7, v0

; GCN: {{^}}[[FLOW]]:
; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]

; GCN: v_mov_b32_e32 v0, 2.0
; GCN: s_or_b64 exec, exec
; GCN: s_and_b64 exec, exec
; GCN: v_mov_b32_e32 v0, 1.0

; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
; GCN-NEXT: s_or_b64 exec, exec
; GCN-NEXT: ; return

define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
entry:
  %uniform.cond = icmp slt i32 %sgpr, 2
  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %divergent.cond0 = icmp eq i32 %vgpr, 3
  br i1 %divergent.cond0, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %divergent.cond1 = icmp eq i32 %vgpr, 7
  br i1 %divergent.cond1, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store i32 9, i32 addrspace(1)* undef
  ret float 1.0

exit1: ; preds = %LeafBlock, %LeafBlock1
  store i32 17, i32 addrspace(3)* undef
  ret float 2.0
}

; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)

; IR: Flow:
; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)

; IR: Flow2:
; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
; IR: call void @llvm.amdgcn.end.cf(i64 %19)
; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock

; IR: exit0:
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock

; IR: Flow1:
; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
; IR: call void @llvm.amdgcn.end.cf(i64 %8)
; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
; IR: %18 = extractvalue { i1, i64 } %17, 0
; IR: %19 = extractvalue { i1, i64 } %17, 1
; IR: br i1 %18, label %exit1, label %Flow2

; IR: exit1:
; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; The non-uniformity of the branch to the exiting blocks requires
; looking at transitive predecessors.
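; In the test below, exit1 is only reached through indirect.exit1, so whether
; the branch into that exit is divergent has to be derived from the leaf
; blocks feeding indirect.exit1 rather than from exit1's immediate predecessor.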

; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(

; IR: exit0: ; preds = %Flow2
; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
; IR-NEXT: br label %UnifiedReturnBlock


; IR: indirect.exit1:
; IR: %load = load volatile i32, i32 addrspace(1)* undef
; IR: store volatile i32 %load, i32 addrspace(1)* undef
; IR: store volatile i32 9, i32 addrspace(1)* undef
; IR: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %Flow2

; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
; IR-NEXT: ret void
define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  %Pivot = icmp slt i32 %tmp16, 2
  br i1 %Pivot, label %LeafBlock, label %LeafBlock1

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1

exit0: ; preds = %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

indirect.exit1:
  %load = load volatile i32, i32 addrspace(1)* undef
  store volatile i32 %load, i32 addrspace(1)* undef
  br label %exit1

exit1: ; preds = %indirect.exit1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @multi_divergent_region_exit_ret_switch(
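; Here the multi-exit region is entered by a divergent switch whose default
; and one case branch straight to the exit blocks as well as to the leaf blocks.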
define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
entry:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %tmp1 = add i32 0, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = add i64 0, %tmp2
  %tmp4 = shl i64 %tmp3, 32
  %tmp5 = ashr exact i64 %tmp4, 32
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  %tmp8 = sext i32 %tmp7 to i64
  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
  %tmp13 = zext i32 %tmp10 to i64
  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
  switch i32 %tmp16, label %exit1
    [ i32 1, label %LeafBlock
      i32 2, label %LeafBlock1
      i32 3, label %exit0 ]

LeafBlock: ; preds = %entry
  %SwitchLeaf = icmp eq i32 %tmp16, 1
  br i1 %SwitchLeaf, label %exit0, label %exit1

LeafBlock1: ; preds = %entry
  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
  br i1 %SwitchLeaf2, label %exit0, label %exit1

exit0: ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 17, i32 addrspace(3)* undef
  ret void

exit1: ; preds = %entry, %LeafBlock, %LeafBlock1
  store volatile i32 9, i32 addrspace(1)* undef
  unreachable
}

; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
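; A divergent two-exit region (two divergent returns) is nested inside a
; triangle formed by a uniform branch that has its own uniform return.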
define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
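; Same idea as above, but the divergent region also contains a nested
; divergent if/then/endif before reaching its two returns.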
define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
entry:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret

divergent.multi.exit.region:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1

divergent.if:
  %vgpr0 = load volatile float, float addrspace(1)* undef
  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif

divergent.then:
  %vgpr1 = load volatile float, float addrspace(1)* undef
  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif

divergent.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %divergent.ret0

divergent.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

divergent.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

uniform.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
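; The uniform multi-exit region (uniform.ret0 and uniform.ret1) is nested
; inside a triangle formed by a divergent branch, so the whole region only
; executes under the divergent condition.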
; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
; IR: br i1 %8, label %uniform.if, label %Flow2

; IR: Flow: ; preds = %uniform.then, %uniform.if
; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
; IR: br i1 %11, label %uniform.endif, label %uniform.ret0

; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
; IR-NEXT: ret void
define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
entry:
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %divergent.cond0 = icmp eq i32 %id.x, 0
  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret

uniform.multi.exit.region:
  %uniform.cond0 = icmp eq i32 %arg0, 4
  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1

uniform.if:
  %sgpr0 = load volatile i32, i32 addrspace(2)* undef
  %uniform.cond1 = icmp slt i32 %sgpr0, 1
  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif

uniform.then:
  %sgpr1 = load volatile i32, i32 addrspace(2)* undef
  %uniform.cond2 = icmp sge i32 %sgpr1, 4
  store volatile i32 33, i32 addrspace(1)* undef
  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif

uniform.endif:
  store volatile i32 38, i32 addrspace(1)* undef
  br label %uniform.ret0

uniform.ret0:
  store volatile i32 11, i32 addrspace(3)* undef
  ret void

uniform.ret1:
  store volatile i32 42, i32 addrspace(3)* undef
  ret void

divergent.ret:
  store volatile i32 9, i32 addrspace(1)* undef
  ret void
}

; IR-LABEL: @multi_divergent_unreachable_exit(
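; All of the divergent exits here are unreachable except for one return, so
; the unreachables are funneled into UnifiedUnreachableBlock, which then
; branches to the UnifiedReturnBlock checked below.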
; IR: UnifiedUnreachableBlock:
; IR-NEXT: call void @llvm.amdgcn.unreachable()
; IR-NEXT: br label %UnifiedReturnBlock

; IR: UnifiedReturnBlock:
; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
; IR-NEXT: ret void
define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  switch i32 %tmp, label %bb3 [
    i32 2, label %bb1
    i32 0, label %bb2
  ]

bb1: ; preds = %bb
  unreachable

bb2: ; preds = %bb
  unreachable

bb3: ; preds = %bb
  switch i32 undef, label %bb5 [
    i32 2, label %bb4
  ]

bb4: ; preds = %bb3
  ret void

bb5: ; preds = %bb3
  unreachable
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }