; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s

@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16
@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16

@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8
@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32

@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef
@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef

declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0


; HSA-LABEL: {{^}}test_no_round_size_1:
; HSA: workgroup_group_segment_byte_size = 38
define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
  ret void
}

; There are two objects, so one requires padding to be correctly
; aligned after the other.

; (38 -> 48) + 38 = 86

; I don't think it is necessary to add padding after, since if there
; were to be a dynamically sized LDS kernel arg, the runtime should
; add the alignment padding if needed.
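;
; For test_round_size_2 below, the presumed layout is: @lds.align16.0 at
; offset 0 (38 bytes), padded up to 48 so @lds.align16.1 starts 16-byte
; aligned, giving 48 + 38 = 86 bytes total.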

; HSA-LABEL: {{^}}test_round_size_2:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.1.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false)

  ret void
}

; 38 + (10 pad) + 38 = 86
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  ret void
}

; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
; HSA: workgroup_group_segment_byte_size = 38
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false)

  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false)
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false)
  ret void
}

; HSA-LABEL: {{^}}test_round_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false)
  ret void
}

; FIXME: Parameter alignment not considered
; HSA-LABEL: {{^}}test_high_align_lds_arg:
; HSA: workgroup_group_segment_byte_size = 0
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false)
  ret void
}

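; The lds.missing.align globals have no explicit alignment, so the sizes
; checked below presumably follow from the natural alignment of the
; element types (4 for i32, 8 for i64).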
; (7 * 8) + (39 * 4) = 212
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 212
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)

  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false)

  ret void
}

; (39 * 4) + (4 pad) + (7 * 8) = 216
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
; HSA: workgroup_group_segment_byte_size = 216
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false)

  %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false)

  ret void
}

; Test how the size needed for padding changes based on when the
; global is encountered during lowering. There should be a consistent
; order to minimize padding waste.
;
; The way global addresses are lowered now, the allocation order is the
; inverse of the first use order, which isn't great.
;
; These globals should be allocated in the optimal order. If sorted to
; minimize padding, the minimum possible size comes from the order:
; align 32, align 8, align 16.
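;
; As a worked example (assuming the inverse-of-use allocation order noted
; above): test_round_size_3_order0 uses the globals as align 32, 16, 8, so
; they are presumably laid out as align 8, 16, 32:
;   38 -> pad to 48 (align 16) -> +38 = 86 -> pad to 96 (align 32) -> +38 = 134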


; align 32, 16, 8
; 38 + (10 pad) + 38 + (10 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  ret void
}

; align 32, 8, 16
; 38 + (2 pad) + 38 + (18 pad) + 38 = 134
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  ret void
}

; align 16, 32, 8
; 38 + (26 pad) + 38 + (10 pad) + 38 = 150
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 150
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  ret void
}

; align 16, 8, 32
; 38 + (2 pad) + 38 + (2 pad) + 38 = 118
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 118
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  ret void
}

; align 8, 32, 16
; 38 + (26 pad) + 38 + (2 pad) + 38 = 142
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 142
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  ret void
}

; align 8, 16, 32
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; HSA-LABEL: {{^}}test_round_size_3_order5:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
  %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false)

  %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false)

  %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false)
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false)

  ret void
}

attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind }
attributes #2 = { convergent nounwind }