Matt Arsenault | 7f83397 | 2016-02-05 19:47:29 +0000 | [diff] [blame^] | 1 | ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=HSA -check-prefix=FUNC %s |
| 2 | |
| 3 | @lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 |
| 4 | @lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 |
| 5 | ; Same 38-byte size with other explicit alignments, used by the padding-order tests. |
| 6 | @lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8 |
| 7 | @lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32 |
| 8 | ; No explicit alignment: [39 x i32] is 156 bytes, [7 x i64] is 56 bytes. |
| 9 | @lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef |
| 10 | @lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef |
| 11 | ; Legacy memcpy intrinsic form; operands are (dest, src, length, alignment, isvolatile). |
| 12 | declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1) #0 |
| 13 | declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #0 |
| 14 | |
| 15 | ; A single 38-byte LDS object: the expected size is not rounded up to its 16-byte alignment. |
| 16 | ; HSA-LABEL: {{^}}test_no_round_size_1: |
| 17 | ; HSA: workgroup_group_segment_byte_size = 38 |
| 18 | define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 19 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 20 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) |
| 21 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) |
| 22 | ret void |
| 23 | } |
| 24 | |
| 25 | ; There are two objects, so one requires padding to be correctly |
| 26 | ; aligned after the other. |
| 27 | |
| 28 | ; (38 -> 48) + 38 = 86 |
| 29 | |
| 30 | ; I don't think it is necessary to add padding after since if there |
| 31 | ; were to be a dynamically sized LDS kernel arg, the runtime should |
| 32 | ; add the alignment padding if necessary. |
| 33 | |
| 34 | ; HSA-LABEL: {{^}}test_round_size_2: |
| 35 | ; HSA: workgroup_group_segment_byte_size = 86 |
| 36 | ; HSA: group_segment_alignment = 4 |
| 37 | define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 38 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 39 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) |
| 40 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) |
| 41 | |
| 42 | %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)* |
| 43 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.1.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) |
| 44 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.1.bc, i32 38, i32 4, i1 false) |
| 45 | |
| 46 | ret void |
| 47 | } |
| 48 | |
| 49 | ; 38 + (10 pad) + 38 = 86 |
| 50 | ; HSA-LABEL: {{^}}test_round_size_2_align_8: |
| 51 | ; HSA: workgroup_group_segment_byte_size = 86 |
| 52 | ; HSA: group_segment_alignment = 4 |
| 53 | define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 54 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 55 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 56 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 57 | |
| 58 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 59 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 60 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 61 | |
| 62 | ret void |
| 63 | } |
| 64 | ; One 38-byte LDS global plus an LDS pointer argument: the expected size counts only the global. |
| 65 | ; HSA-LABEL: {{^}}test_round_local_lds_and_arg: |
| 66 | ; HSA: workgroup_group_segment_byte_size = 38 |
| 67 | ; HSA: group_segment_alignment = 4 |
| 68 | define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { |
| 69 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 70 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) |
| 71 | |
| 72 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) |
| 73 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) |
| 74 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) |
| 75 | ret void |
| 76 | } |
| 77 | ; Only an LDS pointer argument is used and no LDS global, so the expected static size is 0. |
| 78 | ; HSA-LABEL: {{^}}test_round_lds_arg: |
| 79 | ; HSA: workgroup_group_segment_byte_size = 0 |
| 80 | ; HSA: group_segment_alignment = 4 |
| 81 | define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { |
| 82 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) |
| 83 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) |
| 84 | ret void |
| 85 | } |
| 86 | |
| 87 | ; FIXME: Parameter alignment not considered: %lds.arg is align 64 but the checked group_segment_alignment is still 4 |
| 88 | ; HSA-LABEL: {{^}}test_high_align_lds_arg: |
| 89 | ; HSA: workgroup_group_segment_byte_size = 0 |
| 90 | ; HSA: group_segment_alignment = 4 |
| 91 | define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { |
| 92 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false) |
| 93 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false) |
| 94 | ret void |
| 95 | } |
| 96 | |
| 97 | ; (7 * 8) + (39 * 4) = 56 + 156 = 212 |
| 98 | ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: |
| 99 | ; HSA: workgroup_group_segment_byte_size = 212 |
| 100 | ; HSA: group_segment_alignment = 4 |
| 101 | define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 102 | %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* |
| 103 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) |
| 104 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) |
| 105 | ; NOTE(review): the 160-byte copies above exceed the 156-byte [39 x i32] object; confirm this is intended. |
| 106 | %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* |
| 107 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false) |
| 108 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) |
| 109 | |
| 110 | ret void |
| 111 | } |
| 112 | |
| 113 | ; (39 * 4) + (4 pad) + (7 * 8) = 156 + 4 + 56 = 216 |
| 114 | ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: |
| 115 | ; HSA: workgroup_group_segment_byte_size = 216 |
| 116 | ; HSA: group_segment_alignment = 4 |
| 117 | define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 118 | %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* |
| 119 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false) |
| 120 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) |
| 121 | |
| 122 | %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* |
| 123 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) |
| 124 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) |
| 125 | ; NOTE(review): the 160-byte copies above exceed the 156-byte [39 x i32] object; confirm this is intended. |
| 126 | ret void |
| 127 | } |
| 128 | ; Test how the size needed for padding changes based on when the |
| 129 | ; global is encountered during lowering. There should be a consistent |
| 130 | ; order to minimize padding waste. |
| 131 | ; |
| 132 | ; The way global addresses are lowered now, the allocation order is the |
| 133 | ; inverse of first use order, which isn't great. |
| 134 | ; |
| 135 | ; If sorted to minimize padding, the optimal allocation order for these |
| 136 | ; globals is: align 32, align 8, align 16, which gives the minimum |
| 137 | ; possible size of 38 + (2 pad) + 38 + (2 pad) + 38 = 118. |
| 138 | |
| 139 | |
| 140 | ; Use order: align 32, 16, 8 (allocated in reverse: 8, 16, 32) |
| 141 | ; 38 + (10 pad) + 38 + (10 pad) + 38 = 134 |
| 142 | ; HSA-LABEL: {{^}}test_round_size_3_order0: |
| 143 | ; HSA: workgroup_group_segment_byte_size = 134 |
| 144 | ; HSA: group_segment_alignment = 4 |
| 145 | define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 146 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
| 147 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 148 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) |
| 149 | |
| 150 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 151 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 152 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 153 | |
| 154 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 155 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 156 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 157 | |
| 158 | ret void |
| 159 | } |
| 160 | |
| 161 | ; Use order: align 32, 8, 16 (allocated in reverse: 16, 8, 32) |
| 162 | ; 38 + (2 pad) + 38 + (18 pad) + 38 = 134 |
| 163 | ; HSA-LABEL: {{^}}test_round_size_3_order1: |
| 164 | ; HSA: workgroup_group_segment_byte_size = 134 |
| 165 | ; HSA: group_segment_alignment = 4 |
| 166 | define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 167 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
| 168 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 169 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) |
| 170 | |
| 171 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 172 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 173 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 174 | |
| 175 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 176 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 177 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 178 | |
| 179 | ret void |
| 180 | } |
| 181 | |
| 182 | ; Use order: align 16, 32, 8 (allocated in reverse: 8, 32, 16) |
| 183 | ; 38 + (26 pad) + 38 + (10 pad) + 38 = 150 |
| 184 | ; HSA-LABEL: {{^}}test_round_size_3_order2: |
| 185 | ; HSA: workgroup_group_segment_byte_size = 150 |
| 186 | ; HSA: group_segment_alignment = 4 |
| 187 | define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 188 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 189 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 190 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 191 | |
| 192 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
| 193 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 194 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) |
| 195 | |
| 196 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 197 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 198 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 199 | |
| 200 | ret void |
| 201 | } |
| 202 | |
| 203 | ; Use order: align 16, 8, 32 (allocated in reverse: 32, 8, 16) |
| 204 | ; 38 + (2 pad) + 38 + (2 pad) + 38 = 118 |
| 205 | ; HSA-LABEL: {{^}}test_round_size_3_order3: |
| 206 | ; HSA: workgroup_group_segment_byte_size = 118 |
| 207 | ; HSA: group_segment_alignment = 4 |
| 208 | define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 209 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 210 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 211 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 212 | |
| 213 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 214 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 215 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 216 | |
| 217 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
| 218 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 219 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) |
| 220 | |
| 221 | ret void |
| 222 | } |
| 223 | |
| 224 | ; Use order: align 8, 32, 16 (allocated in reverse: 16, 32, 8) |
| 225 | ; 38 + (26 pad) + 38 + (2 pad) + 38 = 142 |
| 226 | ; HSA-LABEL: {{^}}test_round_size_3_order4: |
| 227 | ; HSA: workgroup_group_segment_byte_size = 142 |
| 228 | ; HSA: group_segment_alignment = 4 |
| 229 | define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 230 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 231 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 232 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 233 | |
| 234 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
| 235 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 236 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) |
| 237 | |
| 238 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 239 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 240 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 241 | |
| 242 | ret void |
| 243 | } |
| 244 | |
| 245 | ; Use order: align 8, 16, 32 (allocated in reverse: 32, 16, 8) |
| 246 | ; 38 + (10 pad) + 38 + (2 pad) + 38 = 126 |
| 247 | ; HSA-LABEL: {{^}}test_round_size_3_order5: |
| 248 | ; HSA: workgroup_group_segment_byte_size = 126 |
| 249 | ; HSA: group_segment_alignment = 4 |
| 250 | define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { |
| 251 | %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* |
| 252 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 253 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) |
| 254 | |
| 255 | %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* |
| 256 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 257 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) |
| 258 | |
| 259 | %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* |
| 260 | call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) |
| 261 | call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) |
| 262 | |
| 263 | ret void |
| 264 | } |
| 265 | ; #0 is used by the memcpy declarations and #1 by the test functions; #2 appears unused here. |
| 266 | attributes #0 = { argmemonly nounwind } |
| 267 | attributes #1 = { nounwind } |
| 268 | attributes #2 = { convergent nounwind } |