| 1 | ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s |
| 2 | ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s |
| 3 | ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032 %s |
| 4 | ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1064 %s |
| 5 | ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s |
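; The five RUN lines above compile this file for gfx1010 as wave32 (GFX1032 prefixes),
; as wave64 (GFX1064 prefixes), both of those again with -amdgpu-early-ifcvt=1, and once
; with the target's default wave size (GFX10DEFWAVE prefix). The wave32 patterns expect
; 32-bit lane masks (vcc_lo, exec_lo and the s_*_b32 scalar mask ops); the wave64 patterns
; expect the 64-bit forms (vcc, exec, s_*_b64). A minimal sketch of the difference, with
; register numbers chosen only for illustration:
;   wave32:  v_cmp_lt_i32_e32 vcc_lo, 0, v0
;            v_cndmask_b32_e64 v1, 2, 1, vcc_lo
;   wave64:  v_cmp_lt_i32_e32 vcc, 0, v0
;            v_cndmask_b32_e64 v1, 2, 1, vcc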
| 6 | |
| 7 | ; GCN-LABEL: {{^}}test_vopc_i32: |
| 8 | ; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}} |
| 9 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc_lo |
| 10 | ; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}} |
| 11 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, vcc{{$}} |
| 12 | define amdgpu_kernel void @test_vopc_i32(i32 addrspace(1)* %arg) { |
| 13 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 14 | %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid |
| 15 | %load = load i32, i32 addrspace(1)* %gep, align 4 |
| 16 | %cmp = icmp sgt i32 %load, 0 |
| 17 | %sel = select i1 %cmp, i32 1, i32 2 |
| 18 | store i32 %sel, i32 addrspace(1)* %gep, align 4 |
| 19 | ret void |
| 20 | } |
| 21 | |
| 22 | ; GCN-LABEL: {{^}}test_vopc_f32: |
| 23 | ; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}} |
| 24 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc_lo |
| 25 | ; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}} |
| 26 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, vcc{{$}} |
| 27 | define amdgpu_kernel void @test_vopc_f32(float addrspace(1)* %arg) { |
| 28 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 29 | %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid |
| 30 | %load = load float, float addrspace(1)* %gep, align 4 |
| 31 | %cmp = fcmp ugt float %load, 0.0 |
| 32 | %sel = select i1 %cmp, float 1.0, float 2.0 |
| 33 | store float %sel, float addrspace(1)* %gep, align 4 |
| 34 | ret void |
| 35 | } |
| 36 | |
| 37 | ; GCN-LABEL: {{^}}test_vopc_vcmpx: |
| 38 | ; GFX1032: v_cmpx_le_f32_e32 0, v{{[0-9]+}} |
| 39 | ; GFX1064: v_cmpx_le_f32_e32 0, v{{[0-9]+}} |
| 40 | define amdgpu_ps void @test_vopc_vcmpx(float %x) { |
| 41 | %cmp = fcmp oge float %x, 0.0 |
| 42 | call void @llvm.amdgcn.kill(i1 %cmp) |
| 43 | ret void |
| 44 | } |
| 45 | |
| 46 | ; GCN-LABEL: {{^}}test_vopc_2xf16: |
| 47 | ; GFX1032: v_cmp_le_f16_sdwa [[SC:s[0-9]+]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD |
| 48 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]] |
| 49 | ; GFX1064: v_cmp_le_f16_sdwa [[SC:s\[[0-9:]+\]]], {{[vs][0-9]+}}, v{{[0-9]+}} src0_sel:WORD_1 src1_sel:DWORD |
| 50 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0x3c003c00, v{{[0-9]+}}, [[SC]] |
| 51 | define amdgpu_kernel void @test_vopc_2xf16(<2 x half> addrspace(1)* %arg) { |
| 52 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 53 | %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %lid |
| 54 | %load = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 |
| 55 | %elt = extractelement <2 x half> %load, i32 1 |
| 56 | %cmp = fcmp ugt half %elt, 0.0 |
| 57 | %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load |
| 58 | store <2 x half> %sel, <2 x half> addrspace(1)* %gep, align 4 |
| 59 | ret void |
| 60 | } |
| 61 | |
| 62 | ; GCN-LABEL: {{^}}test_vopc_class: |
| 63 | ; GFX1032: v_cmp_class_f32_e64 [[C:vcc_lo|s[0-9:]+]], s{{[0-9]+}}, 0x204 |
| 64 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]] |
| 65 | ; GFX1064: v_cmp_class_f32_e64 [[C:vcc|s\[[0-9:]+\]]], s{{[0-9]+}}, 0x204 |
| 66 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]]{{$}} |
| 67 | define amdgpu_kernel void @test_vopc_class(i32 addrspace(1)* %out, float %x) #0 { |
| 68 | %fabs = tail call float @llvm.fabs.f32(float %x) |
| 69 | %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 |
| 70 | %ext = zext i1 %cmp to i32 |
| 71 | store i32 %ext, i32 addrspace(1)* %out, align 4 |
| 72 | ret void |
| 73 | } |
| 74 | |
| 75 | ; GCN-LABEL: {{^}}test_vcmp_vcnd_f16: |
| 76 | ; GFX1032: v_cmp_neq_f16_e64 [[C:vcc_lo|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}} |
| 77 | ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]] |
| 78 | |
| 79 | ; GFX1064: v_cmp_neq_f16_e64 [[C:vcc|s\[[0-9:]+\]]], 0x7c00, s{{[0-9]+}} |
| 80 | ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 0x3c00, v{{[0-9]+}}, [[C]]{{$}} |
| 81 | define amdgpu_kernel void @test_vcmp_vcnd_f16(half addrspace(1)* %out, half %x) #0 { |
| 82 | %cmp = fcmp oeq half %x, 0x7FF0000000000000 |
| 83 | %sel = select i1 %cmp, half 1.0, half %x |
| 84 | store half %sel, half addrspace(1)* %out, align 2 |
| 85 | ret void |
| 86 | } |
| 87 | |
| 88 | ; GCN-LABEL: {{^}}test_vop3_cmp_f32_sop_and: |
| 89 | ; GFX1032: v_cmp_nge_f32_e32 vcc_lo, 0, v{{[0-9]+}} |
| 90 | ; GFX1032: v_cmp_nle_f32_e64 [[C2:s[0-9]+]], 1.0, v{{[0-9]+}} |
| 91 | ; GFX1032: s_and_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]] |
| 92 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]] |
| 93 | ; GFX1064: v_cmp_nge_f32_e32 vcc, 0, v{{[0-9]+}} |
| 94 | ; GFX1064: v_cmp_nle_f32_e64 [[C2:s\[[0-9:]+\]]], 1.0, v{{[0-9]+}} |
| 95 | ; GFX1064: s_and_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]] |
| 96 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0, [[AND]] |
| 97 | define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(float addrspace(1)* %arg) { |
| 98 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 99 | %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %lid |
| 100 | %load = load float, float addrspace(1)* %gep, align 4 |
| 101 | %cmp = fcmp ugt float %load, 0.0 |
| 102 | %cmp2 = fcmp ult float %load, 1.0 |
| 103 | %and = and i1 %cmp, %cmp2 |
| 104 | %sel = select i1 %and, float 1.0, float 2.0 |
| 105 | store float %sel, float addrspace(1)* %gep, align 4 |
| 106 | ret void |
| 107 | } |
| 108 | |
| 109 | ; GCN-LABEL: {{^}}test_vop3_cmp_i32_sop_xor: |
| 110 | ; GFX1032: v_cmp_lt_i32_e32 vcc_lo, 0, v{{[0-9]+}} |
| 111 | ; GFX1032: v_cmp_gt_i32_e64 [[C2:s[0-9]+]], 1, v{{[0-9]+}} |
| 112 | ; GFX1032: s_xor_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]] |
| 113 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] |
| 114 | ; GFX1064: v_cmp_lt_i32_e32 vcc, 0, v{{[0-9]+}} |
| 115 | ; GFX1064: v_cmp_gt_i32_e64 [[C2:s\[[0-9:]+\]]], 1, v{{[0-9]+}} |
| 116 | ; GFX1064: s_xor_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]] |
| 117 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] |
| 118 | define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(i32 addrspace(1)* %arg) { |
| 119 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 120 | %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid |
| 121 | %load = load i32, i32 addrspace(1)* %gep, align 4 |
| 122 | %cmp = icmp sgt i32 %load, 0 |
| 123 | %cmp2 = icmp slt i32 %load, 1 |
| 124 | %xor = xor i1 %cmp, %cmp2 |
| 125 | %sel = select i1 %xor, i32 1, i32 2 |
| 126 | store i32 %sel, i32 addrspace(1)* %gep, align 4 |
| 127 | ret void |
| 128 | } |
| 129 | |
| 130 | ; GCN-LABEL: {{^}}test_vop3_cmp_u32_sop_or: |
| 131 | ; GFX1032: v_cmp_lt_u32_e32 vcc_lo, 3, v{{[0-9]+}} |
| 132 | ; GFX1032: v_cmp_gt_u32_e64 [[C2:s[0-9]+]], 2, v{{[0-9]+}} |
| 133 | ; GFX1032: s_or_b32 [[AND:s[0-9]+]], vcc_lo, [[C2]] |
| 134 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] |
| 135 | ; GFX1064: v_cmp_lt_u32_e32 vcc, 3, v{{[0-9]+}} |
| 136 | ; GFX1064: v_cmp_gt_u32_e64 [[C2:s\[[0-9:]+\]]], 2, v{{[0-9]+}} |
| 137 | ; GFX1064: s_or_b64 [[AND:s\[[0-9:]+\]]], vcc, [[C2]] |
| 138 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 2, 1, [[AND]] |
| 139 | define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(i32 addrspace(1)* %arg) { |
| 140 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 141 | %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %lid |
| 142 | %load = load i32, i32 addrspace(1)* %gep, align 4 |
| 143 | %cmp = icmp ugt i32 %load, 3 |
| 144 | %cmp2 = icmp ult i32 %load, 2 |
| 145 | %or = or i1 %cmp, %cmp2 |
| 146 | %sel = select i1 %or, i32 1, i32 2 |
| 147 | store i32 %sel, i32 addrspace(1)* %gep, align 4 |
| 148 | ret void |
| 149 | } |
| 150 | |
| 151 | ; GCN-LABEL: {{^}}test_mask_if: |
| 152 | ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo |
| 153 | ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}} |
| 154 | ; GCN: ; mask branch |
| 155 | define amdgpu_kernel void @test_mask_if(i32 addrspace(1)* %arg) #0 { |
| 156 | %lid = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 157 | %cmp = icmp ugt i32 %lid, 10 |
| 158 | br i1 %cmp, label %if, label %endif |
| 159 | |
| 160 | if: |
| 161 | store i32 0, i32 addrspace(1)* %arg, align 4 |
| 162 | br label %endif |
| 163 | |
| 164 | endif: |
| 165 | ret void |
| 166 | } |
| 167 | |
| 168 | ; GCN-LABEL: {{^}}test_loop_with_if: |
| 169 | ; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}} |
| 170 | ; GFX1032: s_andn2_b32 exec_lo, exec_lo, s{{[0-9]+}} |
| 171 | ; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}] |
| 172 | ; GFX1064: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] |
| 173 | ; GCN: s_cbranch_execz |
| 174 | ; GCN: BB{{.*}}: |
| 175 | ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo |
| 176 | ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}} |
| 177 | ; GCN: s_cbranch_execz |
| 178 | ; GCN: BB{{.*}}: |
| 179 | ; GCN: BB{{.*}}: |
| 180 | ; GFX1032: s_xor_b32 s{{[0-9]+}}, exec_lo, s{{[0-9]+}} |
| 181 | ; GFX1064: s_xor_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] |
| 182 | ; GCN: ; mask branch BB |
| 183 | ; GCN: BB{{.*}}: |
| 184 | ; GCN: BB{{.*}}: |
| 185 | ; GFX1032: s_or_b32 exec_lo, exec_lo, s{{[0-9]+}} |
| 186 | ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, s{{[0-9]+}} |
| 187 | ; GFX1064: s_or_b64 exec, exec, s[{{[0-9:]+}}] |
| 188 | ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}} |
| 189 | ; GCN: ; mask branch BB |
| 190 | ; GCN: BB{{.*}}: |
| 191 | ; GCN: BB{{.*}}: |
| 192 | ; GCN: s_endpgm |
| 193 | define amdgpu_kernel void @test_loop_with_if(i32 addrspace(1)* %arg) #0 { |
| 194 | bb: |
| 195 | %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 196 | br label %bb2 |
| 197 | |
| 198 | bb1: |
| 199 | ret void |
| 200 | |
| 201 | bb2: |
| 202 | %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ] |
| 203 | %tmp4 = icmp slt i32 %tmp3, %tmp |
| 204 | br i1 %tmp4, label %bb5, label %bb11 |
| 205 | |
| 206 | bb5: |
| 207 | %tmp6 = sext i32 %tmp3 to i64 |
| 208 | %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6 |
| 209 | %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4 |
| 210 | %tmp9 = icmp sgt i32 %tmp8, 10 |
| 211 | br i1 %tmp9, label %bb10, label %bb11 |
| 212 | |
| 213 | bb10: |
| 214 | store i32 %tmp, i32 addrspace(1)* %tmp7, align 4 |
| 215 | br label %bb13 |
| 216 | |
| 217 | bb11: |
| 218 | %tmp12 = sdiv i32 %tmp3, 2 |
| 219 | br label %bb13 |
| 220 | |
| 221 | bb13: |
| 222 | %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ] |
| 223 | %tmp15 = add nsw i32 %tmp14, 1 |
| 224 | %tmp16 = icmp slt i32 %tmp14, 255 |
| 225 | br i1 %tmp16, label %bb2, label %bb1 |
| 226 | } |
| 227 | |
| 228 | ; GCN-LABEL: {{^}}test_loop_with_if_else_break: |
| 229 | ; GFX1032: s_and_saveexec_b32 s{{[0-9]+}}, vcc_lo |
| 230 | ; GFX1064: s_and_saveexec_b64 s[{{[0-9:]+}}], vcc{{$}} |
| 231 | ; GCN: ; mask branch |
| 232 | ; GCN: s_cbranch_execz |
| 233 | ; GCN: BB{{.*}}: |
| 234 | ; GCN: BB{{.*}}: |
| 235 | ; GFX1032: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, exec_lo |
| 236 | ; GFX1064: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], exec |
| 237 | ; GFX1032: s_or_b32 s{{[0-9]+}}, vcc_lo, s{{[0-9]+}} |
| 238 | ; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} |
| 239 | ; GFX1064: s_or_b64 s[{{[0-9:]+}}], vcc, s[{{[0-9:]+}}] |
| 240 | ; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| 241 | ; GCN: s_cbranch_execz |
| 242 | ; GCN: BB{{.*}}: |
| 243 | define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 { |
| 244 | bb: |
| 245 | %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 246 | %tmp1 = icmp eq i32 %tmp, 0 |
| 247 | br i1 %tmp1, label %.loopexit, label %.preheader |
| 248 | |
| 249 | .preheader: |
| 250 | br label %bb2 |
| 251 | |
| 252 | bb2: |
| 253 | %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ] |
| 254 | %tmp4 = zext i32 %tmp3 to i64 |
| 255 | %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4 |
| 256 | %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 |
| 257 | %tmp7 = icmp sgt i32 %tmp6, 10 |
| 258 | br i1 %tmp7, label %bb8, label %.loopexit |
| 259 | |
| 260 | bb8: |
| 261 | store i32 %tmp, i32 addrspace(1)* %tmp5, align 4 |
| 262 | %tmp9 = add nuw nsw i32 %tmp3, 1 |
| 263 | %tmp10 = icmp ult i32 %tmp9, 256 |
| 264 | %tmp11 = icmp ult i32 %tmp9, %tmp |
| 265 | %tmp12 = and i1 %tmp10, %tmp11 |
| 266 | br i1 %tmp12, label %bb2, label %.loopexit |
| 267 | |
| 268 | .loopexit: |
| 269 | ret void |
| 270 | } |
| 271 | |
| 272 | ; GCN-LABEL: {{^}}test_addc_vop2b: |
| 273 | ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}} |
| 274 | ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo |
| 275 | ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}} |
| 276 | ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}} |
| 277 | define amdgpu_kernel void @test_addc_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 { |
| 278 | bb: |
| 279 | %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 280 | %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp |
| 281 | %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8 |
| 282 | %tmp5 = add nsw i64 %tmp4, %arg1 |
| 283 | store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8 |
| 284 | ret void |
| 285 | } |
| 286 | |
| 287 | ; GCN-LABEL: {{^}}test_subbrev_vop2b: |
| 288 | ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}} |
| 289 | ; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}} |
| 290 | ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}} |
| 291 | ; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}} |
| 292 | define amdgpu_kernel void @test_subbrev_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 { |
| 293 | bb: |
| 294 | %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 295 | %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp |
| 296 | %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8 |
| 297 | %tmp5 = sub nsw i64 %tmp4, %arg1 |
| 298 | store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8 |
| 299 | ret void |
| 300 | } |
| 301 | |
| 302 | ; GCN-LABEL: {{^}}test_subb_vop2b: |
| 303 | ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}} |
| 304 | ; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}} |
| 305 | ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}} |
| 306 | ; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}} |
| 307 | define amdgpu_kernel void @test_subb_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 { |
| 308 | bb: |
| 309 | %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 310 | %tmp3 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tmp |
| 311 | %tmp4 = load i64, i64 addrspace(1)* %tmp3, align 8 |
| 312 | %tmp5 = sub nsw i64 %arg1, %tmp4 |
| 313 | store i64 %tmp5, i64 addrspace(1)* %tmp3, align 8 |
| 314 | ret void |
| 315 | } |
| 316 | |
| 317 | ; GCN-LABEL: {{^}}test_udiv64: |
| 318 | ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} |
| 319 | ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo |
| 320 | ; GFX1032: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]] |
| 321 | ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} |
| 322 | ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} |
| 323 | ; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}} |
| 324 | ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo |
| 325 | ; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} |
| 326 | ; GFX1032: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo |
| 327 | ; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo |
| 328 | ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}} |
| 329 | ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} |
| 330 | ; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]] |
| 331 | ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} |
| 332 | ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} |
| 333 | ; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} |
| 334 | ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} |
| 335 | ; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} |
| 336 | ; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} |
| 337 | ; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc{{$}} |
| 338 | define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 { |
| 339 | bb: |
| 340 | %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1 |
| 341 | %tmp1 = load i64, i64 addrspace(1)* %tmp, align 8 |
| 342 | %tmp2 = load i64, i64 addrspace(1)* %arg, align 8 |
| 343 | %tmp3 = udiv i64 %tmp1, %tmp2 |
| 344 | %tmp4 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 2 |
| 345 | store i64 %tmp3, i64 addrspace(1)* %tmp4, align 8 |
| 346 | ret void |
| 347 | } |
| 348 | |
| 349 | ; GCN-LABEL: {{^}}test_div_scale_f32: |
| 350 | ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} |
| 351 | ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} |
| 352 | define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { |
| 353 | %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone |
| 354 | %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid |
| 355 | %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 |
| 356 | |
| 357 | %a = load volatile float, float addrspace(1)* %gep.0, align 4 |
| 358 | %b = load volatile float, float addrspace(1)* %gep.1, align 4 |
| 359 | |
| 360 | %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone |
| 361 | %result0 = extractvalue { float, i1 } %result, 0 |
| 362 | store float %result0, float addrspace(1)* %out, align 4 |
| 363 | ret void |
| 364 | } |
| 365 | |
| 366 | ; GCN-LABEL: {{^}}test_div_scale_f64: |
| 367 | ; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] |
| 368 | ; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] |
| 369 | define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 { |
| 370 | %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone |
| 371 | %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid |
| 372 | %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 |
| 373 | |
| 374 | %a = load volatile double, double addrspace(1)* %gep.0, align 8 |
| 375 | %b = load volatile double, double addrspace(1)* %gep.1, align 8 |
| 376 | |
| 377 | %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone |
| 378 | %result0 = extractvalue { double, i1 } %result, 0 |
| 379 | store double %result0, double addrspace(1)* %out, align 8 |
| 380 | ret void |
| 381 | } |
| 382 | |
| 383 | ; GCN-LABEL: {{^}}test_mad_i64_i32: |
| 384 | ; GFX1032: v_mad_i64_i32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] |
| 385 | ; GFX1064: v_mad_i64_i32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] |
| 386 | define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { |
| 387 | %sext0 = sext i32 %arg0 to i64 |
| 388 | %sext1 = sext i32 %arg1 to i64 |
| 389 | %mul = mul i64 %sext0, %sext1 |
| 390 | %mad = add i64 %mul, %arg2 |
| 391 | ret i64 %mad |
| 392 | } |
| 393 | |
| 394 | ; GCN-LABEL: {{^}}test_mad_u64_u32: |
| 395 | ; GFX1032: v_mad_u64_u32 v[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] |
| 396 | ; GFX1064: v_mad_u64_u32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9:]+}}] |
| 397 | define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { |
| 398 | %sext0 = zext i32 %arg0 to i64 |
| 399 | %sext1 = zext i32 %arg1 to i64 |
| 400 | %mul = mul i64 %sext0, %sext1 |
| 401 | %mad = add i64 %mul, %arg2 |
| 402 | ret i64 %mad |
| 403 | } |
| 404 | |
| 405 | ; GCN-LABEL: {{^}}test_div_fmas_f32: |
| 406 | ; GFX1032: v_cmp_eq_u32_e64 vcc_lo, |
| 407 | ; GFX1064: v_cmp_eq_u32_e64 vcc, |
| 408 | ; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} |
| 409 | define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { |
| 410 | %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone |
| 411 | store float %result, float addrspace(1)* %out, align 4 |
| 412 | ret void |
| 413 | } |
| 414 | |
| 415 | ; GCN-LABEL: {{^}}test_div_fmas_f64: |
| 416 | ; GFX1032: v_cmp_eq_u32_e64 vcc_lo, |
| 417 | ; GFX1064: v_cmp_eq_u32_e64 vcc, |
| 418 | ; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] |
| 419 | define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { |
| 420 | %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone |
| 421 | store double %result, double addrspace(1)* %out, align 8 |
| 422 | ret void |
| 423 | } |
| 424 | |
| 425 | ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc: |
| 426 | ; GFX1032: s_mov_b32 [[VCC:vcc_lo]], 0{{$}} |
| 427 | ; GFX1064: s_mov_b64 [[VCC:vcc]], 0{{$}} |
| 428 | ; GFX1032: s_and_saveexec_b32 [[SAVE:s[0-9]+]], s{{[0-9]+}}{{$}} |
| 429 | ; GFX1064: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], s[{{[0-9:]+}}]{{$}} |
| 430 | |
| 431 | ; GCN: load_dword [[LOAD:v[0-9]+]] |
| 432 | ; GCN: v_cmp_ne_u32_e32 [[VCC]], 0, [[LOAD]] |
| 433 | |
| 434 | ; GCN: BB{{[0-9_]+}}: |
| 435 | ; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE]] |
| 436 | ; GFX1064: s_or_b64 exec, exec, [[SAVE]] |
| 437 | ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} |
| 438 | define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) #0 { |
| 439 | entry: |
| 440 | %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone |
| 441 | %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 |
| 442 | %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid |
| 443 | %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 |
| 444 | %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2 |
| 445 | |
| 446 | %a = load float, float addrspace(1)* %gep.a |
| 447 | %b = load float, float addrspace(1)* %gep.b |
| 448 | %c = load float, float addrspace(1)* %gep.c |
| 449 | |
| 450 | %cmp0 = icmp eq i32 %tid, 0 |
| 451 | br i1 %cmp0, label %bb, label %exit |
| 452 | |
| 453 | bb: |
| 454 | %val = load volatile i32, i32 addrspace(1)* %dummy |
| 455 | %cmp1 = icmp ne i32 %val, 0 |
| 456 | br label %exit |
| 457 | |
| 458 | exit: |
| 459 | %cond = phi i1 [false, %entry], [%cmp1, %bb] |
| 460 | %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone |
| 461 | store float %result, float addrspace(1)* %gep.out, align 4 |
| 462 | ret void |
| 463 | } |
| 464 | |
| 465 | ; GCN-LABEL: {{^}}fdiv_f32: |
| 466 | ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} |
| 467 | ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} |
| 468 | ; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} |
| 469 | ; GCN-NOT: vcc |
| 470 | ; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} |
| 471 | define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { |
| 472 | entry: |
| 473 | %fdiv = fdiv float %a, %b |
| 474 | store float %fdiv, float addrspace(1)* %out |
| 475 | ret void |
| 476 | } |
| 477 | |
| 478 | ; GCN-LABEL: {{^}}test_br_cc_f16: |
| 479 | ; GFX1032: v_cmp_nlt_f16_e32 vcc_lo, |
| 480 | ; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo |
| 481 | ; GFX1064: v_cmp_nlt_f16_e32 vcc, |
| 482 | ; GFX1064-NEXT: s_and_b64 vcc, exec, vcc{{$}} |
| 483 | ; GCN-NEXT: s_cbranch_vccnz |
| 484 | define amdgpu_kernel void @test_br_cc_f16( |
| 485 | half addrspace(1)* %r, |
| 486 | half addrspace(1)* %a, |
| 487 | half addrspace(1)* %b) { |
| 488 | entry: |
| 489 | %a.val = load half, half addrspace(1)* %a |
| 490 | %b.val = load half, half addrspace(1)* %b |
| 491 | %fcmp = fcmp olt half %a.val, %b.val |
| 492 | br i1 %fcmp, label %one, label %two |
| 493 | |
| 494 | one: |
| 495 | store half %a.val, half addrspace(1)* %r |
| 496 | ret void |
| 497 | |
| 498 | two: |
| 499 | store half %b.val, half addrspace(1)* %r |
| 500 | ret void |
| 501 | } |
| 502 | |
| 503 | ; GCN-LABEL: {{^}}test_brcc_i1: |
| 504 | ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0 |
| 505 | ; GCN-NEXT: s_cbranch_scc1 |
| 506 | define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { |
| 507 | %cmp0 = icmp ne i1 %val, 0 |
| 508 | br i1 %cmp0, label %store, label %end |
| 509 | |
| 510 | store: |
| 511 | store i32 222, i32 addrspace(1)* %out |
| 512 | ret void |
| 513 | |
| 514 | end: |
| 515 | ret void |
| 516 | } |
| 517 | |
| 518 | ; GCN-LABEL: {{^}}test_preserve_condition_undef_flag: |
| 519 | ; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 |
| 520 | ; GFX1032: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0 |
| 521 | ; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 |
| 522 | ; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} |
| 523 | ; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}} |
| 524 | ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]] |
| 525 | ; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 |
| 526 | ; GFX1064: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0 |
| 527 | ; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 |
| 528 | ; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| 529 | ; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}] |
| 530 | ; GFX1064: s_and_b64 vcc, exec, [[OR2]] |
| 531 | ; GCN: s_cbranch_vccnz |
| 532 | define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 { |
| 533 | bb0: |
| 534 | %tmp = icmp sgt i32 %arg1, 4 |
| 535 | %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) |
| 536 | %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 |
| 537 | %tmp5 = fcmp ogt float %arg2, 0.000000e+00 |
| 538 | %tmp6 = fcmp olt float %arg2, 1.000000e+00 |
| 539 | %tmp7 = fcmp olt float %arg, %tmp4 |
| 540 | %tmp8 = and i1 %tmp5, %tmp6 |
| 541 | %tmp9 = and i1 %tmp8, %tmp7 |
| 542 | br i1 %tmp9, label %bb1, label %bb2 |
| 543 | |
| 544 | bb1: |
| 545 | store volatile i32 0, i32 addrspace(1)* undef |
| 546 | br label %bb2 |
| 547 | |
| 548 | bb2: |
| 549 | ret void |
| 550 | } |
| 551 | |
| 552 | ; GCN-LABEL: {{^}}test_invert_true_phi_cond_break_loop: |
| 553 | ; GFX1032: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, -1 |
| 554 | ; GFX1032: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} |
| 555 | ; GFX1064: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], -1 |
| 556 | ; GFX1064: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| 557 | define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { |
| 558 | bb: |
| 559 | %id = call i32 @llvm.amdgcn.workitem.id.x() |
| 560 | %tmp = sub i32 %id, %arg |
| 561 | br label %bb1 |
| 562 | |
| 563 | bb1: ; preds = %Flow, %bb |
| 564 | %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] |
| 565 | %lsr.iv.next = add i32 %lsr.iv, 1 |
| 566 | %cmp0 = icmp slt i32 %lsr.iv.next, 0 |
| 567 | br i1 %cmp0, label %bb4, label %Flow |
| 568 | |
| 569 | bb4: ; preds = %bb1 |
| 570 | %load = load volatile i32, i32 addrspace(1)* undef, align 4 |
| 571 | %cmp1 = icmp sge i32 %tmp, %load |
| 572 | br label %Flow |
| 573 | |
| 574 | Flow: ; preds = %bb4, %bb1 |
| 575 | %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] |
| 576 | %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] |
| 577 | br i1 %tmp3, label %bb1, label %bb9 |
| 578 | |
| 579 | bb9: ; preds = %Flow |
| 580 | store volatile i32 7, i32 addrspace(3)* undef |
| 581 | ret void |
| 582 | } |
| 583 | |
| 584 | ; GCN-LABEL: {{^}}test_movrels_extract_neg_offset_vgpr: |
| 585 | ; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 1, v{{[0-9]+}} |
| 586 | ; GFX1032: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc_lo |
| 587 | ; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 2, v{{[0-9]+}} |
| 588 | ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc_lo |
| 589 | ; GFX1032: v_cmp_ne_u32_e32 vcc_lo, 3, v{{[0-9]+}} |
| 590 | ; GFX1032: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc_lo |
| 591 | ; GFX1064: v_cmp_eq_u32_e32 vcc, 1, v{{[0-9]+}} |
| 592 | ; GFX1064: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc |
| 593 | ; GFX1064: v_cmp_ne_u32_e32 vcc, 2, v{{[0-9]+}} |
| 594 | ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}, vcc |
| 595 | ; GFX1064: v_cmp_ne_u32_e32 vcc, 3, v{{[0-9]+}} |
| 596 | ; GFX1064: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc |
| 597 | define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(i32 addrspace(1)* %out) #0 { |
| 598 | entry: |
| 599 | %id = call i32 @llvm.amdgcn.workitem.id.x() #1 |
| 600 | %index = add i32 %id, -512 |
| 601 | %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index |
| 602 | store i32 %value, i32 addrspace(1)* %out |
| 603 | ret void |
| 604 | } |
| 605 | |
| 606 | ; GCN-LABEL: {{^}}test_set_inactive: |
| 607 | ; GFX1032: s_not_b32 exec_lo, exec_lo |
| 608 | ; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 42 |
| 609 | ; GFX1032: s_not_b32 exec_lo, exec_lo |
| 610 | ; GFX1064: s_not_b64 exec, exec{{$}} |
| 611 | ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 42 |
| 612 | ; GFX1064: s_not_b64 exec, exec{{$}} |
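; llvm.amdgcn.set.inactive writes its second operand into the lanes that are currently
; inactive, which is why the checks above expect exec to be inverted with s_not, the
; constant to be written while only those previously inactive lanes are live, and exec
; to be inverted back.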
| 613 | define amdgpu_kernel void @test_set_inactive(i32 addrspace(1)* %out, i32 %in) #0 { |
| 614 | %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) |
| 615 | store i32 %tmp, i32 addrspace(1)* %out |
| 616 | ret void |
| 617 | } |
| 618 | |
| 619 | ; GCN-LABEL: {{^}}test_set_inactive_64: |
| 620 | ; GFX1032: s_not_b32 exec_lo, exec_lo |
| 621 | ; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0 |
| 622 | ; GFX1032: v_mov_b32_e32 {{v[0-9]+}}, 0 |
| 623 | ; GFX1032: s_not_b32 exec_lo, exec_lo |
| 624 | ; GFX1064: s_not_b64 exec, exec{{$}} |
| 625 | ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0 |
| 626 | ; GFX1064: v_mov_b32_e32 {{v[0-9]+}}, 0 |
| 627 | ; GFX1064: s_not_b64 exec, exec{{$}} |
| 628 | define amdgpu_kernel void @test_set_inactive_64(i64 addrspace(1)* %out, i64 %in) #0 { |
| 629 | %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) |
| 630 | store i64 %tmp, i64 addrspace(1)* %out |
| 631 | ret void |
| 632 | } |
| 633 | |
| 634 | ; GCN-LABEL: {{^}}test_kill_i1_terminator_float: |
| 635 | ; GFX1032: s_mov_b32 exec_lo, 0 |
| 636 | ; GFX1064: s_mov_b64 exec, 0 |
| 637 | define amdgpu_ps void @test_kill_i1_terminator_float() #0 { |
| 638 | call void @llvm.amdgcn.kill(i1 false) |
| 639 | ret void |
| 640 | } |
| 641 | |
| 642 | ; GCN-LABEL: {{^}}test_kill_i1_terminator_i1: |
| 643 | ; GFX1032: s_or_b32 [[OR:s[0-9]+]], |
| 644 | ; GFX1032: s_and_b32 exec_lo, exec_lo, [[OR]] |
| 645 | ; GFX1064: s_or_b64 [[OR:s\[[0-9:]+\]]], |
| 646 | ; GFX1064: s_and_b64 exec, exec, [[OR]] |
| 647 | define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 { |
| 648 | %c1 = icmp slt i32 %a, %b |
| 649 | %c2 = icmp slt i32 %c, %d |
| 650 | %x = or i1 %c1, %c2 |
| 651 | call void @llvm.amdgcn.kill(i1 %x) |
| 652 | ret void |
| 653 | } |
| 654 | |
| 655 | ; GCN-LABEL: {{^}}test_loop_vcc: |
| 656 | ; GFX1032: v_cmp_lt_f32_e32 vcc_lo, |
| 657 | ; GFX1064: v_cmp_lt_f32_e32 vcc, |
| 658 | ; GCN: s_cbranch_vccnz |
| 659 | define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { |
| 660 | entry: |
| 661 | br label %loop |
| 662 | |
| 663 | loop: |
| 664 | %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] |
| 665 | %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] |
| 666 | %cc = fcmp ogt float %ctr.iv, 7.0 |
| 667 | br i1 %cc, label %break, label %body |
| 668 | |
| 669 | body: |
| 670 | %c.iv0 = extractelement <4 x float> %c.iv, i32 0 |
| 671 | %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) |
| 672 | %ctr.next = fadd float %ctr.iv, 2.0 |
| 673 | br label %loop |
| 674 | |
| 675 | break: |
| 676 | ret <4 x float> %c.iv |
| 677 | } |
| 678 | |
| 679 | ; GCN-LABEL: {{^}}test_wwm1: |
| 680 | ; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1 |
| 681 | ; GFX1032: s_mov_b32 exec_lo, [[SAVE]] |
| 682 | ; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1 |
| 683 | ; GFX1064: s_mov_b64 exec, [[SAVE]] |
| 684 | define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { |
| 685 | main_body: |
| 686 | %out = fadd float %src0, %src1 |
| 687 | %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) |
| 688 | ret float %out.0 |
| 689 | } |
| 690 | |
| 691 | ; GCN-LABEL: {{^}}test_wwm2: |
| 692 | ; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 32, v{{[0-9]+}} |
| 693 | ; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo |
| 694 | ; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1 |
| 695 | ; GFX1032: s_mov_b32 exec_lo, [[SAVE2]] |
| 696 | ; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]] |
| 697 | ; GFX1064: v_cmp_gt_u32_e32 vcc, 32, v{{[0-9]+}} |
| 698 | ; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}} |
| 699 | ; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1 |
| 700 | ; GFX1064: s_mov_b64 exec, [[SAVE2]] |
| 701 | ; GFX1064: s_or_b64 exec, exec, [[SAVE1]] |
| 702 | define amdgpu_ps float @test_wwm2(i32 inreg %idx) { |
| 703 | main_body: |
| 704 | ; use mbcnt to make sure the branch is divergent |
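; mbcnt.lo/hi over an all-ones mask returns this lane's index within the wave, so %cc
; differs from lane to lane and the branch below has to be lowered as a divergent
; exec-mask sequence (the s_and_saveexec checked above) rather than a scalar branch.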
| 705 | %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) |
| 706 | %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) |
| 707 | %cc = icmp uge i32 %hi, 32 |
| 708 | br i1 %cc, label %endif, label %if |
| 709 | |
| 710 | if: |
| 711 | %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0) |
| 712 | %out = fadd float %src, %src |
| 713 | %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) |
| 714 | %out.1 = fadd float %src, %out.0 |
| 715 | br label %endif |
| 716 | |
| 717 | endif: |
| 718 | %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] |
| 719 | ret float %out.2 |
| 720 | } |
| 721 | |
| 722 | ; GCN-LABEL: {{^}}test_wqm1: |
| 723 | ; GFX1032: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo |
| 724 | ; GFX1032: s_wqm_b32 exec_lo, exec_lo |
| 725 | ; GFX1032: s_and_b32 exec_lo, exec_lo, [[ORIG]] |
| 726 | ; GFX1064: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec{{$}} |
| 727 | ; GFX1064: s_wqm_b64 exec, exec{{$}} |
| 728 | ; GFX1064: s_and_b64 exec, exec, [[ORIG]] |
| 729 | define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 { |
| 730 | main_body: |
| 731 | %inst23 = extractelement <2 x float> %pos, i32 0 |
| 732 | %inst24 = extractelement <2 x float> %pos, i32 1 |
| 733 | %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) |
| 734 | %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) |
| 735 | %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) |
| 736 | %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) |
| 737 | %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) |
| 738 | ret <4 x float> %tex |
| 739 | } |
| 740 | |
| 741 | ; GCN-LABEL: {{^}}test_wqm2: |
| 742 | ; GFX1032: s_wqm_b32 exec_lo, exec_lo |
| 743 | ; GFX1032: s_and_b32 exec_lo, exec_lo, s{{[0-9]+}} |
| 744 | ; GFX1064: s_wqm_b64 exec, exec{{$}} |
| 745 | ; GFX1064: s_and_b64 exec, exec, s[{{[0-9:]+}}] |
| 746 | define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 { |
| 747 | main_body: |
| 748 | %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0) |
| 749 | %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0) |
| 750 | %out = fadd float %src0, %src1 |
| 751 | %out.0 = bitcast float %out to i32 |
| 752 | %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) |
| 753 | %out.2 = bitcast i32 %out.1 to float |
| 754 | ret float %out.2 |
| 755 | } |
| 756 | |
| 757 | ; GCN-LABEL: {{^}}test_intr_fcmp_i64: |
| 758 | ; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}} |
| 759 | ; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}| |
| 760 | ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] |
| 761 | ; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| |
| 762 | ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] |
| 763 | ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] |
| 764 | ; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]], |
| 765 | define amdgpu_kernel void @test_intr_fcmp_i64(i64 addrspace(1)* %out, float %src, float %a) { |
| 766 | %temp = call float @llvm.fabs.f32(float %a) |
| 767 | %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) |
| 768 | store i64 %result, i64 addrspace(1)* %out |
| 769 | ret void |
| 770 | } |
| 771 | |
| 772 | ; GCN-LABEL: {{^}}test_intr_icmp_i64: |
| 773 | ; GFX1032-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], 0{{$}} |
| 774 | ; GFX1032-DAG: v_cmp_eq_u32_e64 [[C_LO:vcc_lo|s[0-9]+]], 0x64, {{s[0-9]+}} |
| 775 | ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[C_LO]] |
| 776 | ; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], 0x64, {{s[0-9]+}} |
| 777 | ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] |
| 778 | ; GFX1064-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[C_HI]] |
| 779 | ; GCN: store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[V_LO]]:[[V_HI]]], |
| 780 | define amdgpu_kernel void @test_intr_icmp_i64(i64 addrspace(1)* %out, i32 %src) { |
| 781 | %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) |
| 782 | store i64 %result, i64 addrspace(1)* %out |
| 783 | ret void |
| 784 | } |
| 785 | |
| 786 | ; GCN-LABEL: {{^}}test_intr_fcmp_i32: |
| 787 | ; GFX1032-DAG: v_cmp_eq_f32_e64 s[[C_LO:[0-9]+]], {{s[0-9]+}}, |{{[vs][0-9]+}}| |
| 788 | ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] |
| 789 | ; GFX1064: v_cmp_eq_f32_e64 s{{\[}}[[C_LO:[0-9]+]]:[[C_HI:[0-9]+]]], {{s[0-9]+}}, |{{[vs][0-9]+}}| |
| 790 | ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]] |
| 791 | ; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]], |
| 792 | define amdgpu_kernel void @test_intr_fcmp_i32(i32 addrspace(1)* %out, float %src, float %a) { |
| 793 | %temp = call float @llvm.fabs.f32(float %a) |
| 794 | %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) |
| 795 | store i32 %result, i32 addrspace(1)* %out |
| 796 | ret void |
| 797 | } |
| 798 | |
| 799 | ; GCN-LABEL: {{^}}test_intr_icmp_i32: |
| 800 | ; GFX1032-DAG: v_cmp_eq_u32_e64 s[[C_LO:[0-9]+]], 0x64, {{s[0-9]+}} |
| 801 | ; GFX1032-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} |
| 802 | ; GFX1064: v_cmp_eq_u32_e64 s{{\[}}[[C_LO:[0-9]+]]:{{[0-9]+}}], 0x64, {{s[0-9]+}} |
| 803 | ; GFX1064-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[C_LO]]{{$}} |
| 804 | ; GCN: store_dword v[{{[0-9:]+}}], v[[V_LO]], |
| 805 | define amdgpu_kernel void @test_intr_icmp_i32(i32 addrspace(1)* %out, i32 %src) { |
| 806 | %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) |
| 807 | store i32 %result, i32 addrspace(1)* %out |
| 808 | ret void |
| 809 | } |
| 810 | |
| 811 | ; GCN-LABEL: {{^}}test_wqm_vote: |
| 812 | ; GFX1032: v_cmp_neq_f32_e32 vcc_lo, 0 |
| 813 | ; GFX1032: s_wqm_b32 [[WQM:s[0-9]+]], vcc_lo |
| 814 | ; GFX1032: s_and_b32 exec_lo, exec_lo, [[WQM]] |
| 815 | ; GFX1064: v_cmp_neq_f32_e32 vcc, 0 |
| 816 | ; GFX1064: s_wqm_b64 [[WQM:s\[[0-9:]+\]]], vcc{{$}} |
| 817 | ; GFX1064: s_and_b64 exec, exec, [[WQM]] |
| 818 | define amdgpu_ps void @test_wqm_vote(float %a) { |
| 819 | %c1 = fcmp une float %a, 0.0 |
| 820 | %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) |
| 821 | call void @llvm.amdgcn.kill(i1 %c2) |
| 822 | ret void |
| 823 | } |
| 824 | |
| 825 | ; GCN-LABEL: {{^}}test_branch_true: |
| 826 | ; GFX1032: s_and_b32 vcc_lo, exec_lo, -1 |
| 827 | ; GFX1064: s_and_b64 vcc, exec, -1 |
| 828 | define amdgpu_kernel void @test_branch_true() #2 { |
| 829 | entry: |
| 830 | br i1 true, label %for.end, label %for.body.lr.ph |
| 831 | |
| 832 | for.body.lr.ph: ; preds = %entry |
| 833 | br label %for.body |
| 834 | |
| 835 | for.body: ; preds = %for.body, %for.body.lr.ph |
| 836 | br i1 undef, label %for.end, label %for.body |
| 837 | |
| 838 | for.end: ; preds = %for.body, %entry |
| 839 | ret void |
| 840 | } |
| 841 | |
| 842 | ; GCN-LABEL: {{^}}test_ps_live: |
| 843 | ; GFX1032: s_mov_b32 [[C:s[0-9]+]], exec_lo |
| 844 | ; GFX1064: s_mov_b64 [[C:s\[[0-9:]+\]]], exec{{$}} |
| 845 | ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[C]] |
| 846 | define amdgpu_ps float @test_ps_live() #0 { |
| 847 | %live = call i1 @llvm.amdgcn.ps.live() |
| 848 | %live.32 = zext i1 %live to i32 |
| 849 | %r = bitcast i32 %live.32 to float |
| 850 | ret float %r |
| 851 | } |
| 852 | |
| 853 | ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64: |
| 854 | ; GFX1032: v_cmp_neq_f64_e64 [[C:s[0-9]+]], s[{{[0-9:]+}}], 1.0 |
| 855 | ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[C]] |
| 856 | ; GFX1064: v_cmp_neq_f64_e64 [[C:s\[[0-9:]+\]]], s[{{[0-9:]+}}], 1.0 |
| 857 | ; GFX1064: s_and_b64 vcc, exec, [[C]] |
| 858 | define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { |
| 859 | entry: |
| 860 | %v = load double, double addrspace(1)* %in |
| 861 | %cc = fcmp oeq double %v, 1.000000e+00 |
| 862 | br i1 %cc, label %if, label %endif |
| 863 | |
| 864 | if: |
| 865 | %u = fadd double %v, %v |
| 866 | br label %endif |
| 867 | |
| 868 | endif: |
| 869 | %r = phi double [ %v, %entry ], [ %u, %if ] |
| 870 | store double %r, double addrspace(1)* %out |
| 871 | ret void |
| 872 | } |
| 873 | |
| 874 | ; GCN-LABEL: {{^}}test_vgprblocks_w32_attr: |
| 875 | ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result |
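; Attribute #3 (defined at the end of the file) supplies "target-features"="+wavefrontsize32",
; so even the run with no explicit wavefrontsize feature (the GFX10DEFWAVE prefix) selects
; wave32 for this function; the VGPR block count it checks below reflects that.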
| 876 | ; GFX10DEFWAVE: ; VGPRBlocks: 1 |
| 877 | define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e, |
| 878 | float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 { |
| 879 | main_body: |
| 880 | %s = fadd float %a, %b |
| 881 | %s.1 = fadd float %s, %c |
| 882 | %s.2 = fadd float %s.1, %d |
| 883 | %s.3 = fadd float %s.2, %e |
| 884 | %s.4 = fadd float %s.3, %f |
| 885 | %s.5 = fadd float %s.4, %g |
| 886 | %s.6 = fadd float %s.5, %h |
| 887 | %s.7 = fadd float %s.6, %i |
| 888 | %s.8 = fadd float %s.7, %j |
| 889 | %s.9 = fadd float %s.8, %k |
| 890 | %s.10 = fadd float %s.9, %l |
| 891 | ret float %s.10 |
| 892 | } |
| 893 | |
| 894 | ; GCN-LABEL: {{^}}test_vgprblocks_w64_attr: |
| 895 | ; Test that the wave size can be overridden in function attributes and that the block size is correct as a result |
| 896 | ; GFX10DEFWAVE: ; VGPRBlocks: 2 |
| 897 | define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, |
| 898 | float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { |
| 899 | main_body: |
| 900 | %s = fadd float %a, %b |
| 901 | %s.1 = fadd float %s, %c |
| 902 | %s.2 = fadd float %s.1, %d |
| 903 | %s.3 = fadd float %s.2, %e |
| 904 | %s.4 = fadd float %s.3, %f |
| 905 | %s.5 = fadd float %s.4, %g |
| 906 | %s.6 = fadd float %s.5, %h |
| 907 | %s.7 = fadd float %s.6, %i |
| 908 | %s.8 = fadd float %s.7, %j |
| 909 | %s.9 = fadd float %s.8, %k |
| 910 | %s.10 = fadd float %s.9, %l |
| 911 | ret float %s.10 |
| 912 | } |
| 913 | |
| 914 | ; GCN-LABEL: {{^}}icmp64: |
| 915 | ; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v |
| 916 | ; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v |
| 917 | define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { |
| 918 | entry: |
| 919 | %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 920 | %mul4 = mul nsw i32 %s, %n |
| 921 | %cmp = icmp slt i32 0, %mul4 |
| 922 | br label %if.end |
| 923 | |
| 924 | if.end: ; preds = %entry |
| 925 | %rem = urem i32 %id, %s |
| 926 | %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32) |
| 927 | %shr = lshr i64 %icmp, 1 |
| 928 | %notmask = shl nsw i64 -1, 0 |
| 929 | %and = and i64 %notmask, %shr |
| 930 | %or = or i64 %and, -9223372036854775808 |
| 931 | %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) |
| 932 | %cast = trunc i64 %cttz to i32 |
| 933 | %cmp3 = icmp ugt i32 10, %cast |
| 934 | %cmp6 = icmp ne i32 %rem, 0 |
| 935 | %brmerge = or i1 %cmp6, %cmp3 |
| 936 | br i1 %brmerge, label %if.end2, label %if.then |
| 937 | |
| 938 | if.then: ; preds = %if.end |
| 939 | unreachable |
| 940 | |
| 941 | if.end2: ; preds = %if.end |
| 942 | ret void |
| 943 | } |
| 944 | |
| 945 | ; GCN-LABEL: {{^}}fcmp64: |
| 946 | ; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v |
| 947 | ; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v |
| 948 | define amdgpu_kernel void @fcmp64(float %n, float %s) { |
| 949 | entry: |
| 950 | %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 951 | %id.f = uitofp i32 %id to float |
| 952 | %mul4 = fmul float %s, %n |
| 953 | %cmp = fcmp ult float 0.0, %mul4 |
| 954 | br label %if.end |
| 955 | |
| 956 | if.end: ; preds = %entry |
| 957 | %rem.f = frem float %id.f, %s |
| 958 | %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1) |
| 959 | %shr = lshr i64 %fcmp, 1 |
| 960 | %notmask = shl nsw i64 -1, 0 |
| 961 | %and = and i64 %notmask, %shr |
| 962 | %or = or i64 %and, -9223372036854775808 |
| 963 | %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) |
| 964 | %cast = trunc i64 %cttz to i32 |
| 965 | %cmp3 = icmp ugt i32 10, %cast |
| 966 | %cmp6 = fcmp one float %rem.f, 0.0 |
| 967 | %brmerge = or i1 %cmp6, %cmp3 |
| 968 | br i1 %brmerge, label %if.end2, label %if.then |
| 969 | |
| 970 | if.then: ; preds = %if.end |
| 971 | unreachable |
| 972 | |
| 973 | if.end2: ; preds = %if.end |
| 974 | ret void |
| 975 | } |
| 976 | |
| 977 | ; GCN-LABEL: {{^}}icmp32: |
| 978 | ; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v |
| 979 | ; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v |
| 980 | define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { |
| 981 | entry: |
| 982 | %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 983 | %mul4 = mul nsw i32 %s, %n |
| 984 | %cmp = icmp slt i32 0, %mul4 |
| 985 | br label %if.end |
| 986 | |
| 987 | if.end: ; preds = %entry |
| 988 | %rem = urem i32 %id, %s |
| 989 | %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32) |
| 990 | %shr = lshr i32 %icmp, 1 |
| 991 | %notmask = shl nsw i32 -1, 0 |
| 992 | %and = and i32 %notmask, %shr |
| 993 | %or = or i32 %and, 2147483648 |
| 994 | %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) |
| 995 | %cmp3 = icmp ugt i32 10, %cttz |
| 996 | %cmp6 = icmp ne i32 %rem, 0 |
| 997 | %brmerge = or i1 %cmp6, %cmp3 |
| 998 | br i1 %brmerge, label %if.end2, label %if.then |
| 999 | |
| 1000 | if.then: ; preds = %if.end |
| 1001 | unreachable |
| 1002 | |
| 1003 | if.end2: ; preds = %if.end |
| 1004 | ret void |
| 1005 | } |
| 1006 | |
| 1007 | ; GCN-LABEL: {{^}}fcmp32: |
| 1008 | ; GFX1032: v_cmp_eq_f32_e32 vcc_lo, 0, v |
| 1009 | ; GFX1064: v_cmp_eq_f32_e32 vcc, 0, v |
| 1010 | define amdgpu_kernel void @fcmp32(float %n, float %s) { |
| 1011 | entry: |
| 1012 | %id = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 1013 | %id.f = uitofp i32 %id to float |
| 1014 | %mul4 = fmul float %s, %n |
| 1015 | %cmp = fcmp ult float 0.0, %mul4 |
| 1016 | br label %if.end |
| 1017 | |
| 1018 | if.end: ; preds = %entry |
| 1019 | %rem.f = frem float %id.f, %s |
| 1020 | %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1) |
| 1021 | %shr = lshr i32 %fcmp, 1 |
| 1022 | %notmask = shl nsw i32 -1, 0 |
| 1023 | %and = and i32 %notmask, %shr |
| 1024 | %or = or i32 %and, 2147483648 |
| 1025 | %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) |
| 1026 | %cmp3 = icmp ugt i32 10, %cttz |
| 1027 | %cmp6 = fcmp one float %rem.f, 0.0 |
| 1028 | %brmerge = or i1 %cmp6, %cmp3 |
| 1029 | br i1 %brmerge, label %if.end2, label %if.then |
| 1030 | |
| 1031 | if.then: ; preds = %if.end |
| 1032 | unreachable |
| 1033 | |
| 1034 | if.end2: ; preds = %if.end |
| 1035 | ret void |
| 1036 | } |
| 1037 | |
| 1038 | declare void @external_void_func_void() #1 |
| 1039 | |
| 1040 | ; Test save/restore of VGPR needed for SGPR spilling. |
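; The VGPR used for the SGPR spill lanes (v32 in the checks below) must be saved and
; reloaded with every lane enabled, so the prologue and epilogue flip exec to -1 with
; s_or_saveexec around the buffer_store/buffer_load of v32 before v_writelane/v_readlane
; use it to hold s30, s31 and s34.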
| 1041 | |
| 1042 | ; GCN-LABEL: {{^}}callee_no_stack_with_call: |
| 1043 | ; GCN: s_waitcnt |
| 1044 | ; GCN-NEXT: s_waitcnt_vscnt |
| 1045 | |
| 1046 | ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| 1047 | ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]+]], -1{{$}} |
| 1048 | ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill |
| 1049 | ; GCN-NEXT: v_nop |
| 1050 | ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] |
| 1051 | ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] |
| 1052 | |
| 1053 | ; GCN-NEXT: v_writelane_b32 v32, s34, 2 |
| 1054 | ; GCN: s_mov_b32 s34, s32 |
| 1055 | ; GFX1064: s_add_u32 s32, s32, 0x400 |
| 1056 | ; GFX1032: s_add_u32 s32, s32, 0x200 |
| 1057 | |
| 1058 | |
| 1059 | ; GCN-DAG: v_writelane_b32 v32, s30, 0 |
| 1060 | ; GCN-DAG: v_writelane_b32 v32, s31, 1 |
| 1061 | ; GCN: s_swappc_b64 |
| 1062 | ; GCN-DAG: v_readlane_b32 s4, v32, 0 |
| 1063 | ; GCN-DAG: v_readlane_b32 s5, v32, 1 |
| 1064 | |
| 1065 | |
| 1066 | ; GFX1064: s_sub_u32 s32, s32, 0x400 |
| 1067 | ; GFX1032: s_sub_u32 s32, s32, 0x200 |
| 1068 | ; GCN: v_readlane_b32 s34, v32, 2 |
| 1069 | ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| 1070 | ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]+]], -1{{$}} |
| 1071 | ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload |
| 1072 | ; GCN-NEXT: v_nop |
| 1073 | ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] |
| 1074 | ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] |
| 1075 | ; GCN-NEXT: s_waitcnt vmcnt(0) |
| 1076 | ; GCN-NEXT: s_setpc_b64 |
| 1077 | define void @callee_no_stack_with_call() #1 { |
| 1078 | call void @external_void_func_void() |
| 1079 | ret void |
| 1080 | } |
| 1081 | |
| 1082 | |
| 1083 | declare i32 @llvm.amdgcn.workitem.id.x() |
| 1084 | declare float @llvm.fabs.f32(float) |
| 1085 | declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) |
| 1086 | declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) |
| 1087 | declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) |
| 1088 | declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) |
| 1089 | declare i1 @llvm.amdgcn.class.f32(float, i32) |
| 1090 | declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) |
| 1091 | declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) |
| 1092 | declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) |
| 1093 | declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) |
| 1094 | declare float @llvm.amdgcn.wwm.f32(float) |
| 1095 | declare i32 @llvm.amdgcn.wqm.i32(i32) |
| 1096 | declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) |
| 1097 | declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) |
| 1098 | declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) |
| 1099 | declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) |
| 1100 | declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) |
| 1101 | declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32) |
| 1102 | declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) |
| 1103 | declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32) |
| 1104 | declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32) |
| 1105 | declare void @llvm.amdgcn.kill(i1) |
| 1106 | declare i1 @llvm.amdgcn.wqm.vote(i1) |
| 1107 | declare i1 @llvm.amdgcn.ps.live() |
| 1108 | declare i64 @llvm.cttz.i64(i64, i1) |
| 1109 | declare i32 @llvm.cttz.i32(i32, i1) |
| 1110 | |
| 1111 | attributes #0 = { nounwind readnone speculatable } |
| 1112 | attributes #1 = { nounwind } |
| 1113 | attributes #2 = { nounwind readnone optnone noinline } |
| 1114 | attributes #3 = { "target-features"="+wavefrontsize32" } |
| 1115 | attributes #4 = { "target-features"="+wavefrontsize64" } |