Blame - llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll - toolchain/llvm-project

blob: 9d0b6b395996b56ddff7dd92c2971878627e689f [file] [log] [blame]

Matt Arsenault	b8f8dbc	2017-03-24 19:52:05 +0000	[diff] [blame^]	1	; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
				2	; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
				3
				4	; Add extra verifier runs. There were some cases where invalid IR
				5	; was produced but happened to be fixed by the later passes.
				6
				7	; Make sure divergent control flow with multiple exits from a region
				8	; is properly handled. UnifyFunctionExitNodes should be run before
				9	; StructurizeCFG.
				10
				11	; IR-LABEL: @multi_divergent_region_exit_ret_ret(
				12	; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
				13	; IR: %2 = extractvalue { i1, i64 } %1, 0
				14	; IR: %3 = extractvalue { i1, i64 } %1, 1
				15	; IR: br i1 %2, label %LeafBlock1, label %Flow
				16
				17	; IR: Flow:
				18	; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
				19	; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
				20	; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
				21	; IR: %7 = extractvalue { i1, i64 } %6, 0
				22	; IR: %8 = extractvalue { i1, i64 } %6, 1
				23	; IR: br i1 %7, label %LeafBlock, label %Flow1
				24
				25	; IR: LeafBlock:
				26	; IR: br label %Flow1
				27
				28	; IR: LeafBlock1:
				29	; IR: br label %Flow{{$}}
				30
				31	; IR: Flow2:
				32	; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
				33	; IR: call void @llvm.amdgcn.end.cf(i64 %19)
				34	; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
				35	; IR: %13 = extractvalue { i1, i64 } %12, 0
				36	; IR: %14 = extractvalue { i1, i64 } %12, 1
				37	; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
				38
				39	; IR: exit0:
				40	; IR: store volatile i32 9, i32 addrspace(1)* undef
				41	; IR: br label %UnifiedReturnBlock
				42
				43	; IR: Flow1:
				44	; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
				45	; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
				46	; IR: call void @llvm.amdgcn.end.cf(i64 %8)
				47	; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
				48	; IR: %18 = extractvalue { i1, i64 } %17, 0
				49	; IR: %19 = extractvalue { i1, i64 } %17, 1
				50	; IR: br i1 %18, label %exit1, label %Flow2
				51
				52	; IR: exit1:
				53	; IR: store volatile i32 17, i32 addrspace(3)* undef
				54	; IR: br label %Flow2
				55
				56	; IR: UnifiedReturnBlock:
				57	; IR: call void @llvm.amdgcn.end.cf(i64 %14)
				58	; IR: ret void
				59
				60
				61	; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
				62	; GCN: v_cmp_lt_i32_e32 vcc, 1
				63	; GCN: s_and_saveexec_b64
				64	; GCN: s_xor_b64
				65
				66
				67	; FIXME: Why is this compare essentially repeated?
				68	; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
				69	; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
				70	; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
				71	; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
				72
				73	; GCN: ; %Flow1
				74	; GCN-NEXT: s_or_b64 exec, exec
				75	; GCN: v_cmp_ne_u32_e32 vcc, 0
				76
				77	; GCN: ; %exit1
				78	; GCN: ds_write_b32
				79
				80	; GCN: %Flow2
				81	; GCN-NEXT: s_or_b64 exec, exec
				82	; GCN: v_cmp_ne_u32_e32 vcc, 0
				83	; GCN-NEXT: s_and_saveexec_b64
				84	; GCN-NEXT: s_xor_b64
				85
				86	; GCN: ; %exit0
				87	; GCN: buffer_store_dword
				88
				89	; GCN: ; %UnifiedReturnBlock
				90	; GCN-NEXT: s_or_b64 exec, exec
				91	; GCN-NEXT: s_endpgm
				92	define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
					; Shape under test: a two-level branch on one loaded value that fans
					; out to two exit blocks, both terminated by 'ret void'.
				93	entry:
				94	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				95	%tmp1 = add i32 0, %tmp
				96	%tmp2 = zext i32 %tmp1 to i64
				97	%tmp3 = add i64 0, %tmp2
				98	%tmp4 = shl i64 %tmp3, 32
				99	%tmp5 = ashr exact i64 %tmp4, 32
				100	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				101	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				102	%tmp8 = sext i32 %tmp7 to i64
				103	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				104	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				105	%tmp13 = zext i32 %tmp10 to i64
				106	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				107	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 ; value tested by every branch below; reached through loads whose first index is the workitem id
				108	%Pivot = icmp slt i32 %tmp16, 2 ; first-level split: tmp16 < 2 goes to LeafBlock, otherwise LeafBlock1
				109	br i1 %Pivot, label %LeafBlock, label %LeafBlock1
				110
				111	LeafBlock: ; preds = %entry
				112	%SwitchLeaf = icmp eq i32 %tmp16, 1
				113	br i1 %SwitchLeaf, label %exit0, label %exit1
				114
				115	LeafBlock1: ; preds = %entry
				116	%SwitchLeaf2 = icmp eq i32 %tmp16, 2
				117	br i1 %SwitchLeaf2, label %exit0, label %exit1
				118
				119	exit0: ; preds = %LeafBlock, %LeafBlock1
				120	store volatile i32 9, i32 addrspace(1)* undef
				121	ret void
				122
				123	exit1: ; preds = %LeafBlock, %LeafBlock1
				124	store volatile i32 17, i32 addrspace(3)* undef
				125	ret void
				126	}
				127
				128	; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
				129	; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
				130
				131	; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
				132
				133	; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
				134	; IR: call void @llvm.amdgcn.end.cf(i64 %19)
				135	; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
				136	; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
				137
				138
				139	; IR: UnifiedUnreachableBlock:
				140	; IR-NEXT: unreachable
				141
				142
				143	; FIXME: Probably should insert an s_endpgm anyway.
				144	; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
				145	; GCN: ; %UnifiedUnreachableBlock
				146	; GCN-NEXT: .Lfunc_end
				147	define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
					; Same two-level branch shape as the ret/ret test, but both exit
					; blocks end in 'unreachable', so the function contains no return.
				148	entry:
				149	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				150	%tmp1 = add i32 0, %tmp
				151	%tmp2 = zext i32 %tmp1 to i64
				152	%tmp3 = add i64 0, %tmp2
				153	%tmp4 = shl i64 %tmp3, 32
				154	%tmp5 = ashr exact i64 %tmp4, 32
				155	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				156	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				157	%tmp8 = sext i32 %tmp7 to i64
				158	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				159	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				160	%tmp13 = zext i32 %tmp10 to i64
				161	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				162	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16 ; value tested by every branch below
				163	%Pivot = icmp slt i32 %tmp16, 2
				164	br i1 %Pivot, label %LeafBlock, label %LeafBlock1
				165
				166	LeafBlock: ; preds = %entry
				167	%SwitchLeaf = icmp eq i32 %tmp16, 1
				168	br i1 %SwitchLeaf, label %exit0, label %exit1
				169
				170	LeafBlock1: ; preds = %entry
				171	%SwitchLeaf2 = icmp eq i32 %tmp16, 2
				172	br i1 %SwitchLeaf2, label %exit0, label %exit1
				173
				174	exit0: ; preds = %LeafBlock, %LeafBlock1
				175	store volatile i32 9, i32 addrspace(1)* undef
				176	unreachable
				177
				178	exit1: ; preds = %LeafBlock, %LeafBlock1
				179	store volatile i32 17, i32 addrspace(3)* undef
				180	unreachable
				181	}
				182
				183	; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
				184	; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
				185	; IR: llvm.amdgcn.if
				186	; IR: br i1
				187
				188	; IR: {{^}}Flow:
				189	; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
				190	; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
				191	; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
				192	; IR: br i1 %7, label %LeafBlock, label %Flow1
				193
				194	; IR: {{^}}LeafBlock:
				195	; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
				196	; IR: %9 = xor i1 %divergent.cond1, true
				197	; IR: br label %Flow1
				198
				199	; IR: LeafBlock1:
				200	; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
				201	; IR: %10 = xor i1 %uniform.cond0, true
				202	; IR: br label %Flow
				203
				204	; IR: Flow2:
				205	; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
				206	; IR: call void @llvm.amdgcn.end.cf(i64 %19)
				207	; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
				208	; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
				209
				210	; IR: exit0:
				211	; IR: store volatile i32 9, i32 addrspace(1)* undef
				212	; IR: br label %UnifiedReturnBlock
				213
				214	; IR: {{^}}Flow1:
				215	; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
				216	; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
				217	; IR: call void @llvm.amdgcn.end.cf(i64 %8)
				218	; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
				219	; IR: %18 = extractvalue { i1, i64 } %17, 0
				220	; IR: %19 = extractvalue { i1, i64 } %17, 1
				221	; IR: br i1 %18, label %exit1, label %Flow2
				222
				223	; IR: exit1:
				224	; IR: store volatile i32 17, i32 addrspace(3)* undef
				225	; IR: br label %Flow2
				226
				227	; IR: UnifiedReturnBlock:
				228	; IR: call void @llvm.amdgcn.end.cf(i64 %14)
				229	; IR: ret void
				230	define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
					; Mixed-condition variant: the entry branch and LeafBlock test the
					; loaded value tmp16, while LeafBlock1 tests the kernel argument arg3.
					; Both exit blocks end in 'ret void'.
				231	entry:
				232	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				233	%tmp1 = add i32 0, %tmp
				234	%tmp2 = zext i32 %tmp1 to i64
				235	%tmp3 = add i64 0, %tmp2
				236	%tmp4 = shl i64 %tmp3, 32
				237	%tmp5 = ashr exact i64 %tmp4, 32
				238	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				239	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				240	%tmp8 = sext i32 %tmp7 to i64
				241	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				242	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				243	%tmp13 = zext i32 %tmp10 to i64
				244	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				245	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
				246	%divergent.cond0 = icmp slt i32 %tmp16, 2
				247	br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
				248
				249	LeafBlock: ; preds = %entry
				250	%divergent.cond1 = icmp eq i32 %tmp16, 1
				251	br i1 %divergent.cond1, label %exit0, label %exit1
				252
				253	LeafBlock1: ; preds = %entry
				254	%uniform.cond0 = icmp eq i32 %arg3, 2 ; compares the plain kernel argument, not the loaded value
				255	br i1 %uniform.cond0, label %exit0, label %exit1
				256
				257	exit0: ; preds = %LeafBlock, %LeafBlock1
				258	store volatile i32 9, i32 addrspace(1)* undef
				259	ret void
				260
				261	exit1: ; preds = %LeafBlock, %LeafBlock1
				262	store volatile i32 17, i32 addrspace(3)* undef
				263	ret void
				264	}
				265
				266	; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
				267	; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
				268	; IR: br i1 %2, label %LeafBlock1, label %Flow
				269
				270	; IR: Flow:
				271	; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
				272	; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
				273	; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
				274
				275	; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
				276	; IR: call void @llvm.amdgcn.end.cf(i64 %19)
				277	; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
				278
				279	define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
					; Mirror of the divergent_ret_uniform_ret test: here LeafBlock tests
					; the kernel argument arg3 and LeafBlock1 tests the loaded value tmp16.
					; Both exit blocks end in 'ret void'.
				280	entry:
				281	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				282	%tmp1 = add i32 0, %tmp
				283	%tmp2 = zext i32 %tmp1 to i64
				284	%tmp3 = add i64 0, %tmp2
				285	%tmp4 = shl i64 %tmp3, 32
				286	%tmp5 = ashr exact i64 %tmp4, 32
				287	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				288	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				289	%tmp8 = sext i32 %tmp7 to i64
				290	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				291	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				292	%tmp13 = zext i32 %tmp10 to i64
				293	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				294	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
				295	%Pivot = icmp slt i32 %tmp16, 2
				296	br i1 %Pivot, label %LeafBlock, label %LeafBlock1
				297
				298	LeafBlock: ; preds = %entry
				299	%SwitchLeaf = icmp eq i32 %arg3, 1 ; compares the plain kernel argument, not the loaded value
				300	br i1 %SwitchLeaf, label %exit0, label %exit1
				301
				302	LeafBlock1: ; preds = %entry
				303	%SwitchLeaf2 = icmp eq i32 %tmp16, 2
				304	br i1 %SwitchLeaf2, label %exit0, label %exit1
				305
				306	exit0: ; preds = %LeafBlock, %LeafBlock1
				307	store volatile i32 9, i32 addrspace(1)* undef
				308	ret void
				309
				310	exit1: ; preds = %LeafBlock, %LeafBlock1
				311	store volatile i32 17, i32 addrspace(3)* undef
				312	ret void
				313	}
				314
				315	; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
				316	; IR: Flow2:
				317	; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
				318	; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
				319	; IR: call void @llvm.amdgcn.end.cf(i64 %20)
				320
				321	; IR: UnifiedReturnBlock:
				322	; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
				323	; IR: call void @llvm.amdgcn.end.cf(i64 %15)
				324	; IR: ret float %UnifiedRetVal
				325	define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
					; Value-returning variant: every branch tests the argument vgpr, and
					; the two exit blocks return different float constants (1.0 and 2.0).
					; Unlike the void tests, the stores in the exit blocks are not volatile.
				326	entry:
				327	%Pivot = icmp slt i32 %vgpr, 2
				328	br i1 %Pivot, label %LeafBlock, label %LeafBlock1
				329
				330	LeafBlock: ; preds = %entry
				331	%SwitchLeaf = icmp eq i32 %vgpr, 1
				332	br i1 %SwitchLeaf, label %exit0, label %exit1
				333
				334	LeafBlock1: ; preds = %entry
				335	%SwitchLeaf2 = icmp eq i32 %vgpr, 2
				336	br i1 %SwitchLeaf2, label %exit0, label %exit1
				337
				338	exit0: ; preds = %LeafBlock, %LeafBlock1
				339	store i32 9, i32 addrspace(1)* undef
				340	ret float 1.0
				341
				342	exit1: ; preds = %LeafBlock, %LeafBlock1
				343	store i32 17, i32 addrspace(3)* undef
				344	ret float 2.0
				345	}
				346
				347	; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
				348
				349	; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
				350	; GCN: s_cmp_gt_i32 s0, 1
				351	; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
				352
				353	; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
				354
				355	; GCN: {{^}}[[FLOW]]:
				356	; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
				357
				358	; GCN: v_mov_b32_e32 v0, 2.0
				359	; GCN: s_or_b64 exec, exec
				360	; GCN: s_and_b64 exec, exec
				361	; GCN: v_mov_b32_e32 v0, 1.0
				362
				363	; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
				364	; GCN-NEXT: s_or_b64 exec, exec
				365	; GCN-NEXT: ; return
				366
				367	define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
					; The entry branch tests the 'inreg' argument sgpr; the two leaf blocks
					; then test the other argument, vgpr, against 3 and 7. The exit blocks
					; return 1.0 and 2.0 respectively.
				368	entry:
				369	%uniform.cond = icmp slt i32 %sgpr, 2
				370	br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
				371
				372	LeafBlock: ; preds = %entry
				373	%divergent.cond0 = icmp eq i32 %vgpr, 3
				374	br i1 %divergent.cond0, label %exit0, label %exit1
				375
				376	LeafBlock1: ; preds = %entry
				377	%divergent.cond1 = icmp eq i32 %vgpr, 7
				378	br i1 %divergent.cond1, label %exit0, label %exit1
				379
				380	exit0: ; preds = %LeafBlock, %LeafBlock1
				381	store i32 9, i32 addrspace(1)* undef
				382	ret float 1.0
				383
				384	exit1: ; preds = %LeafBlock, %LeafBlock1
				385	store i32 17, i32 addrspace(3)* undef
				386	ret float 2.0
				387	}
				388
				389	; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
				390	; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
				391
				392	; IR: Flow:
				393	; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
				394	; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
				395	; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
				396
				397	; IR: Flow2:
				398	; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
				399	; IR: call void @llvm.amdgcn.end.cf(i64 %19)
				400	; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
				401	; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
				402
				403	; IR: exit0:
				404	; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
				405	; IR-NEXT: br label %UnifiedReturnBlock
				406
				407	; IR: Flow1:
				408	; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
				409	; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
				410	; IR: call void @llvm.amdgcn.end.cf(i64 %8)
				411	; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
				412	; IR: %18 = extractvalue { i1, i64 } %17, 0
				413	; IR: %19 = extractvalue { i1, i64 } %17, 1
				414	; IR: br i1 %18, label %exit1, label %Flow2
				415
				416	; IR: exit1:
				417	; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
				418	; IR-NEXT: call void @llvm.amdgcn.unreachable()
				419	; IR-NEXT: br label %Flow2
				420
				421	; IR: UnifiedReturnBlock:
				422	; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
				423	; IR-NEXT: ret void
				424	define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
					; Mixed-terminator variant of the two-level branch: exit0 ends in
					; 'ret void' while exit1 ends in 'unreachable'. Note the stored
					; constants are swapped relative to the ret/ret test (exit0 stores 17,
					; exit1 stores 9).
				425	entry:
				426	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				427	%tmp1 = add i32 0, %tmp
				428	%tmp2 = zext i32 %tmp1 to i64
				429	%tmp3 = add i64 0, %tmp2
				430	%tmp4 = shl i64 %tmp3, 32
				431	%tmp5 = ashr exact i64 %tmp4, 32
				432	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				433	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				434	%tmp8 = sext i32 %tmp7 to i64
				435	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				436	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				437	%tmp13 = zext i32 %tmp10 to i64
				438	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				439	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
				440	%Pivot = icmp slt i32 %tmp16, 2
				441	br i1 %Pivot, label %LeafBlock, label %LeafBlock1
				442
				443	LeafBlock: ; preds = %entry
				444	%SwitchLeaf = icmp eq i32 %tmp16, 1
				445	br i1 %SwitchLeaf, label %exit0, label %exit1
				446
				447	LeafBlock1: ; preds = %entry
				448	%SwitchLeaf2 = icmp eq i32 %tmp16, 2
				449	br i1 %SwitchLeaf2, label %exit0, label %exit1
				450
				451	exit0: ; preds = %LeafBlock, %LeafBlock1
				452	store volatile i32 17, i32 addrspace(3)* undef
				453	ret void
				454
				455	exit1: ; preds = %LeafBlock, %LeafBlock1
				456	store volatile i32 9, i32 addrspace(1)* undef
				457	unreachable
				458	}
				459
				460	; The non-uniformity of the branch to the exiting blocks requires
				461	; looking at transitive predecessors.
				462
				463	; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
				464
				465	; IR: exit0: ; preds = %Flow2
				466	; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
				467	; IR-NEXT: br label %UnifiedReturnBlock
				468
				469
				470	; IR: indirect.exit1:
				471	; IR: %load = load volatile i32, i32 addrspace(1)* undef
				472	; IR: store volatile i32 %load, i32 addrspace(1)* undef
				473	; IR: store volatile i32 9, i32 addrspace(1)* undef
				474	; IR: call void @llvm.amdgcn.unreachable()
				475	; IR-NEXT: br label %Flow2
				476
				477	; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
				478	; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
				479	; IR-NEXT: ret void
				480	define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
					; Like the ret/unreachable test, except the leaf blocks do not branch
					; to exit1 directly: they go through indirect.exit1, which then
					; branches unconditionally to exit1.
				481	entry:
				482	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				483	%tmp1 = add i32 0, %tmp
				484	%tmp2 = zext i32 %tmp1 to i64
				485	%tmp3 = add i64 0, %tmp2
				486	%tmp4 = shl i64 %tmp3, 32
				487	%tmp5 = ashr exact i64 %tmp4, 32
				488	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				489	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				490	%tmp8 = sext i32 %tmp7 to i64
				491	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				492	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				493	%tmp13 = zext i32 %tmp10 to i64
				494	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				495	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
				496	%Pivot = icmp slt i32 %tmp16, 2
				497	br i1 %Pivot, label %LeafBlock, label %LeafBlock1
				498
				499	LeafBlock: ; preds = %entry
				500	%SwitchLeaf = icmp eq i32 %tmp16, 1
				501	br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
				502
				503	LeafBlock1: ; preds = %entry
				504	%SwitchLeaf2 = icmp eq i32 %tmp16, 2
				505	br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
				506
				507	exit0: ; preds = %LeafBlock, %LeafBlock1
				508	store volatile i32 17, i32 addrspace(3)* undef
				509	ret void
				510
				511	indirect.exit1: ; preds = %LeafBlock, %LeafBlock1
				512	%load = load volatile i32, i32 addrspace(1)* undef
				513	store volatile i32 %load, i32 addrspace(1)* undef
				514	br label %exit1
				515
				516	exit1: ; preds = %indirect.exit1
				517	store volatile i32 9, i32 addrspace(1)* undef
				518	unreachable
				519	}
				520
				521	; IR-LABEL: @multi_divergent_region_exit_ret_switch(
				522	define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
					; Switch-based variant: the entry block switches on the loaded value
					; (1 -> LeafBlock, 2 -> LeafBlock1, 3 -> exit0, default -> exit1), so
					; both exit blocks are also reachable directly from entry. exit0 ends
					; in 'ret void' and exit1 ends in 'unreachable'.
				523	entry:
				524	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
				525	%tmp1 = add i32 0, %tmp
				526	%tmp2 = zext i32 %tmp1 to i64
				527	%tmp3 = add i64 0, %tmp2
				528	%tmp4 = shl i64 %tmp3, 32
				529	%tmp5 = ashr exact i64 %tmp4, 32
				530	%tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
				531	%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
				532	%tmp8 = sext i32 %tmp7 to i64
				533	%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
				534	%tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
				535	%tmp13 = zext i32 %tmp10 to i64
				536	%tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
				537	%tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
				538	switch i32 %tmp16, label %exit1
				539	[ i32 1, label %LeafBlock
				540	i32 2, label %LeafBlock1
				541	i32 3, label %exit0 ]
				542
				543	LeafBlock: ; preds = %entry
				544	%SwitchLeaf = icmp eq i32 %tmp16, 1 ; repeats the case value that selected this block
				545	br i1 %SwitchLeaf, label %exit0, label %exit1
				546
				547	LeafBlock1: ; preds = %entry
				548	%SwitchLeaf2 = icmp eq i32 %tmp16, 2 ; repeats the case value that selected this block
				549	br i1 %SwitchLeaf2, label %exit0, label %exit1
				550
				551	exit0: ; preds = %entry, %LeafBlock, %LeafBlock1
				552	store volatile i32 17, i32 addrspace(3)* undef
				553	ret void
				554
				555	exit1: ; preds = %entry, %LeafBlock, %LeafBlock1
				556	store volatile i32 9, i32 addrspace(1)* undef
				557	unreachable
				558	}
				559
				560	; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
				561	define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
					; The outer branch tests the kernel argument arg0; the nested branch
					; tests the workitem id. The function has three separate 'ret void'
					; blocks: two inside the nested region and one on the outer false edge.
				562	entry:
				563	%uniform.cond0 = icmp eq i32 %arg0, 4
				564	br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
				565
				566	divergent.multi.exit.region: ; preds = %entry
				567	%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
				568	%divergent.cond0 = icmp eq i32 %id.x, 0
				569	br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
				570
				571	divergent.ret0: ; preds = %divergent.multi.exit.region
				572	store volatile i32 11, i32 addrspace(3)* undef
				573	ret void
				574
				575	divergent.ret1: ; preds = %divergent.multi.exit.region
				576	store volatile i32 42, i32 addrspace(3)* undef
				577	ret void
				578
				579	uniform.ret: ; preds = %entry
				580	store volatile i32 9, i32 addrspace(1)* undef
				581	ret void
				582	}
				583
				584	; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
				585	define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
					; Extends the simpler nest_in_uniform_triangle test: the nested region
					; now contains an if/then/endif diamond (conditions computed from
					; volatile float loads) on the path that leads to divergent.ret0.
				586	entry:
				587	%uniform.cond0 = icmp eq i32 %arg0, 4
				588	br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
				589
				590	divergent.multi.exit.region: ; preds = %entry
				591	%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
				592	%divergent.cond0 = icmp eq i32 %id.x, 0
				593	br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
				594
				595	divergent.if: ; preds = %divergent.multi.exit.region
				596	%vgpr0 = load volatile float, float addrspace(1)* undef
				597	%divergent.cond1 = fcmp ogt float %vgpr0, 1.0
				598	br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
				599
				600	divergent.then: ; preds = %divergent.if
				601	%vgpr1 = load volatile float, float addrspace(1)* undef
				602	%divergent.cond2 = fcmp olt float %vgpr1, 4.0
				603	store volatile i32 33, i32 addrspace(1)* undef
				604	br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
				605
				606	divergent.endif: ; preds = %divergent.if, %divergent.then
				607	store volatile i32 38, i32 addrspace(1)* undef
				608	br label %divergent.ret0
				609
				610	divergent.ret0: ; preds = %divergent.then, %divergent.endif
				611	store volatile i32 11, i32 addrspace(3)* undef
				612	ret void
				613
				614	divergent.ret1: ; preds = %divergent.multi.exit.region
				615	store volatile i32 42, i32 addrspace(3)* undef
				616	ret void
				617
				618	uniform.ret: ; preds = %entry
				619	store volatile i32 9, i32 addrspace(1)* undef
				620	ret void
				621	}
				622
				623	; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
				624	; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
				625	; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
				626	; IR: br i1 %8, label %uniform.if, label %Flow2
				627
				628	; IR: Flow: ; preds = %uniform.then, %uniform.if
				629	; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
				630	; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
				631
				632	; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
				633	; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
				634	; IR-NEXT: ret void
				635	define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
					; Inverse nesting of the divergent_complex test: the outer branch
					; tests the workitem id, and the nested multi-exit region branches on
					; the kernel argument arg0 and on volatile i32 loads from
					; addrspace(2).
				636	entry:
				637	%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
				638	%divergent.cond0 = icmp eq i32 %id.x, 0
				639	br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
				640
				641	uniform.multi.exit.region: ; preds = %entry
				642	%uniform.cond0 = icmp eq i32 %arg0, 4
				643	br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
				644
				645	uniform.if: ; preds = %uniform.multi.exit.region
				646	%sgpr0 = load volatile i32, i32 addrspace(2)* undef
				647	%uniform.cond1 = icmp slt i32 %sgpr0, 1
				648	br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
				649
				650	uniform.then: ; preds = %uniform.if
				651	%sgpr1 = load volatile i32, i32 addrspace(2)* undef
				652	%uniform.cond2 = icmp sge i32 %sgpr1, 4
				653	store volatile i32 33, i32 addrspace(1)* undef
				654	br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
				655
				656	uniform.endif: ; preds = %uniform.if, %uniform.then
				657	store volatile i32 38, i32 addrspace(1)* undef
				658	br label %uniform.ret0
				659
				660	uniform.ret0: ; preds = %uniform.then, %uniform.endif
				661	store volatile i32 11, i32 addrspace(3)* undef
				662	ret void
				663
				664	uniform.ret1: ; preds = %uniform.multi.exit.region
				665	store volatile i32 42, i32 addrspace(3)* undef
				666	ret void
				667
				668	divergent.ret: ; preds = %entry
				669	store volatile i32 9, i32 addrspace(1)* undef
				670	ret void
				671	}
				672
				673	; IR-LABEL: @multi_divergent_unreachable_exit(
				674	; IR: UnifiedUnreachableBlock:
				675	; IR-NEXT: call void @llvm.amdgcn.unreachable()
				676	; IR-NEXT: br label %UnifiedReturnBlock
				677
				678	; IR: UnifiedReturnBlock:
				679	; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
				680	; IR-NEXT: ret void
				681	define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
					; A switch on the workitem id leads to three 'unreachable' blocks
					; (bb1, bb2, bb5) and a single returning block (bb4). bb4 and bb5 are
					; reached through a second switch, in bb3, whose operand is undef.
				682	bb:
				683	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
				684	switch i32 %tmp, label %bb3 [
				685	i32 2, label %bb1
				686	i32 0, label %bb2
				687	]
				688
				689	bb1: ; preds = %bb
				690	unreachable
				691
				692	bb2: ; preds = %bb
				693	unreachable
				694
				695	bb3: ; preds = %bb
				696	switch i32 undef, label %bb5 [
				697	i32 2, label %bb4
				698	]
				699
				700	bb4: ; preds = %bb3
				701	ret void
				702
				703	bb5: ; preds = %bb3
				704	unreachable
				705	}
				706
				707	declare i32 @llvm.amdgcn.workitem.id.x() #1 ; intrinsic called by the tests that branch on the workitem id
				708
				709	attributes #0 = { nounwind } ; group 0: attached to every define visible in this file
				710	attributes #1 = { nounwind readnone } ; group 1: used on the intrinsic declaration and on its annotated call sites