Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 1 | ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s |
| 2 | |
| 3 | @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 |
| 4 | @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 |
| 5 | |
| 6 | @lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8 |
| 7 | @lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32 |
| 8 | |
| 9 | @lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef |
| 10 | @lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef |
| 11 | |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 12 | declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0 |
| 13 | declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0 |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 14 | |
| 15 | |
| 16 | ; HSA-LABEL: {{^}}test_no_round_size_1: |
| 17 | ; HSA: workgroup_group_segment_byte_size = 38 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 18 | define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 19 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 20 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) |
| 21 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 22 | ret void |
| 23 | } |
| 24 | |
| 25 | ; There are two objects, so one requires padding to be correctly |
| 26 | ; aligned after the other. |
| 27 | |
| 28 | ; (38 -> 48) + 38 = 86 |
| 29 | |
| 30 | ; I don't think it is necessary to add padding after since if there |
| 31 | ; were to be a dynamically sized LDS kernel arg, the runtime should |
| 32 | ; add the alignment padding if needed. |
| 33 | |
| 34 | ; HSA-LABEL: {{^}}test_round_size_2: |
| 35 | ; HSA: workgroup_group_segment_byte_size = 86 |
| 36 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 37 | define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 38 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 39 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) |
| 40 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 41 | |
| 42 | %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 43 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.1.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) |
| 44 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 45 | |
| 46 | ret void |
| 47 | } |
| 48 | |
| 49 | ; 38 + (10 pad) + 38 = 86 |
| 50 | ; HSA-LABEL: {{^}}test_round_size_2_align_8: |
| 51 | ; HSA: workgroup_group_segment_byte_size = 86 |
| 52 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 53 | define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 54 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 55 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 56 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 57 | |
| 58 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 59 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 60 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 61 | |
| 62 | ret void |
| 63 | } |
| 64 | |
| 65 | ; HSA-LABEL: {{^}}test_round_local_lds_and_arg: |
| 66 | ; HSA: workgroup_group_segment_byte_size = 38 |
| 67 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 68 | define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 69 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 70 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 71 | |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 72 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false) |
| 73 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false) |
| 74 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 75 | ret void |
| 76 | } |
| 77 | |
| 78 | ; HSA-LABEL: {{^}}test_round_lds_arg: |
| 79 | ; HSA: workgroup_group_segment_byte_size = 0 |
| 80 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 81 | define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 82 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false) |
| 83 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 84 | ret void |
| 85 | } |
| 86 | |
| 87 | ; FIXME: Parameter alignment not considered |
| 88 | ; HSA-LABEL: {{^}}test_high_align_lds_arg: |
| 89 | ; HSA: workgroup_group_segment_byte_size = 0 |
| 90 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 91 | define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 92 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false) |
| 93 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 94 | ret void |
| 95 | } |
| 96 | |
| 97 | ; (7 * 8) + (39 * 4) = 212 |
| 98 | ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: |
| 99 | ; HSA: workgroup_group_segment_byte_size = 212 |
| 100 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 101 | define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 102 | %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 103 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) |
| 104 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 105 | |
| 106 | %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 107 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false) |
| 108 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 109 | |
| 110 | ret void |
| 111 | } |
| 112 | |
| 113 | ; (39 * 4) + (4 pad) + (7 * 8) = 216 |
| 114 | ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: |
| 115 | ; HSA: workgroup_group_segment_byte_size = 216 |
| 116 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 117 | define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 118 | %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 119 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false) |
| 120 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 121 | |
| 122 | %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 123 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) |
| 124 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 125 | |
| 126 | ret void |
| 127 | } |
| 128 | ; Test how the size needed for padding changes based on when the |
| 129 | ; global is encountered during lowering. There should be a consistent |
| 130 | ; order to minimize padding waste. |
| 131 | ; |
| 132 | ; The way global addresses are lowered now, this is the inverse of |
| 133 | ; first-use order, which isn't great. |
| 134 | ; |
| 135 | ; This should be the optimal order for these globals. If sorted to |
| 136 | ; minimize padding, the order giving the minimum possible size is: |
| 137 | ; align 32, align 8, align 16 |
| 138 | |
| 139 | |
| 140 | ; align 32, 16, 8 |
| 141 | ; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 |
| 142 | ; HSA-LABEL: {{^}}test_round_size_3_order0: |
| 143 | ; HSA: workgroup_group_segment_byte_size = 134 |
| 144 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 145 | define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 146 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 147 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 148 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 149 | |
| 150 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 151 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 152 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 153 | |
| 154 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 155 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 156 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 157 | |
| 158 | ret void |
| 159 | } |
| 160 | |
| 161 | ; align 32, 8, 16 |
| 162 | ; 38 + (2 pad) + 38 + (18 pad) + 38 = 134 |
| 163 | ; HSA-LABEL: {{^}}test_round_size_3_order1: |
| 164 | ; HSA: workgroup_group_segment_byte_size = 134 |
| 165 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 166 | define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 167 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 168 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 169 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 170 | |
| 171 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 172 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 173 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 174 | |
| 175 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 176 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 177 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 178 | |
| 179 | ret void |
| 180 | } |
| 181 | |
| 182 | ; align 16, 32, 8 |
| 183 | ; 38 + (26 pad) + 38 + (10 pad) + 38 = 150 |
| 184 | ; HSA-LABEL: {{^}}test_round_size_3_order2: |
| 185 | ; HSA: workgroup_group_segment_byte_size = 150 |
| 186 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 187 | define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 188 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 189 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 190 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 191 | |
| 192 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 193 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 194 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 195 | |
| 196 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 197 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 198 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 199 | |
| 200 | ret void |
| 201 | } |
| 202 | |
| 203 | ; align 16, 8, 32 |
| 204 | ; 38 + (2 pad) + 38 + (2 pad) + 38 = 118 |
| 205 | ; HSA-LABEL: {{^}}test_round_size_3_order3: |
| 206 | ; HSA: workgroup_group_segment_byte_size = 118 |
| 207 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 208 | define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 209 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 210 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 211 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 212 | |
| 213 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 214 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 215 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 216 | |
| 217 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 218 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 219 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 220 | |
| 221 | ret void |
| 222 | } |
| 223 | |
| 224 | ; align 8, 32, 16 |
| 225 | ; 38 + (26 pad) + 38 + (2 pad) + 38 = 142 |
| 226 | ; HSA-LABEL: {{^}}test_round_size_3_order4: |
| 227 | ; HSA: workgroup_group_segment_byte_size = 142 |
| 228 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 229 | define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 230 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 231 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 232 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 233 | |
| 234 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 235 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 236 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 237 | |
| 238 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 239 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 240 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 241 | |
| 242 | ret void |
| 243 | } |
| 244 | |
| 245 | ; align 8, 16, 32 |
| 246 | ; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 |
| 247 | ; HSA-LABEL: {{^}}test_round_size_3_order5: |
| 248 | ; HSA: workgroup_group_segment_byte_size = 126 |
| 249 | ; HSA: group_segment_alignment = 4 |
Matt Arsenault | 3dbeefa | 2017-03-21 21:39:51 +0000 | [diff] [blame] | 250 | define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 251 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 252 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 253 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 254 | |
| 255 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 256 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 257 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 258 | |
| 259 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
Daniel Neilson | 1e68724 | 2018-01-19 17:13:12 +0000 | [diff] [blame] | 260 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) |
| 261 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) |
Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame] | 262 | |
| 263 | ret void |
| 264 | } |
| 265 | |
| 266 | attributes #0 = { argmemonly nounwind } |
| 267 | attributes #1 = { nounwind } |
| 268 | attributes #2 = { convergent nounwind } |