[AMDGPU] Transform __read_pipe_* and __write_pipe_*
When the packet size equals the packet alignment and is a power of 2, transform
__read_pipe_* and __write_pipe_* calls into the specialized library functions.
Differential Revision: https://reviews.llvm.org/D36831
llvm-svn: 312598
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index eab1fe4..47eb9a9 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -299,8 +299,8 @@
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
-; GCN: %0 = fmul fast float %__powx21, %__powx21
-; GCN: %__powprod3 = fmul fast float %0, %__powx22
+; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -314,8 +314,8 @@
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
-; GCN: %0 = fmul fast float %__powx21, %__powx21
-; GCN: %__powprod3 = fmul fast float %0, %__powx22
+; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -331,8 +331,8 @@
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
-; GCN: %0 = fmul fast float %__powx21, %__powx21
-; GCN: %__powprod3 = fmul fast float %0, %__powx22
+; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22
define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
@@ -350,12 +350,12 @@
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
-; GCN-PRELINK: %0 = bitcast float %tmp to i32
-; GCN-PRELINK: %__pow_sign = and i32 %0, -2147483648
-; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
-; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
-; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
-; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
+; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
+; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
+; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
+; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
+; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
+; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
@@ -393,12 +393,12 @@
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %__yeven = shl i32 %conv, 31
-; GCN-PRELINK: %0 = bitcast float %tmp to i32
-; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %0
-; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
-; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
-; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
-; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
+; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
+; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
+; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
+; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]]
+; GCN-PRELINK: %[[r3:.*]] = bitcast float addrspace(1)* %a to i32 addrspace(1)*
+; GCN-PRELINK: store i32 %[[r2]], i32 addrspace(1)* %[[r3]], align 4
define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
@@ -692,3 +692,96 @@
}
declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)
+
+%opencl.pipe_t = type opaque
+%opencl.reserve_id_t = type opaque
+; test_read_pipe: both pipe reads pass 4 for each of their two trailing i32 operands (presumably packet size and alignment); the checks below expect the _4-suffixed callees taking an i32 pointer, without those two operands.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND:[0-9]+]]
+; GCN-PRELINK: call i32 @__read_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
+entry:
+ %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
+ %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)* ; i8 pointer in address space 4, passed to both reads below
+ %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; two-operand form plus the trailing 4, 4
+ %tmp3 = tail call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) ; no check line in this test refers to the reserve call
+ %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; four-operand form using the reservation from %tmp3
+ tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) ; no check line in this test refers to the commit call
+ ret void
+}
+
+declare i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32)
+
+declare %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32)
+
+declare i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32)
+
+declare void @__commit_read_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32)
+; test_write_pipe: both pipe writes pass 4 for each of their two trailing i32 operands (presumably packet size and alignment); the checks below expect the _4-suffixed callees taking an i32 pointer, without those two operands.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr)
+; GCN-PRELINK: call i32 @__write_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__write_pipe_4_4(%opencl.pipe_t addrspace(1)* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 2, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 addrspace(1)* %ptr) local_unnamed_addr {
+entry:
+ %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
+ %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8 addrspace(4)* ; i8 pointer in address space 4, passed to both writes below
+ %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; two-operand form plus the trailing 4, 4
+ %tmp3 = tail call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0 ; no check line in this test refers to the reserve call
+ %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 2, i8 addrspace(4)* %tmp1, i32 4, i32 4) #0 ; four-operand form using the reservation from %tmp3
+ tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t* %tmp3, i32 4, i32 4) #0 ; no check line in this test refers to the commit call
+ ret void
+}
+
+declare i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)*, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+
+declare %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)*, i32, i32, i32) local_unnamed_addr
+
+declare i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i8 addrspace(4)*, i32, i32) local_unnamed_addr
+
+declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_id_t*, i32, i32) local_unnamed_addr
+
+%struct.S = type { [100 x i32] }
+; test_pipe_size: reads whose two trailing i32 operands are equal (1 through 128) are expected to become __read_pipe_2_<n> with a pointer of matching width; the final read passes 400 and 4 and is expected to stay a generic __read_pipe_2 call.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size
+; GCN-PRELINK: call i32 @__read_pipe_2_1(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i16 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_4(%opencl.pipe_t addrspace(1)* %{{.*}}, i32 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_8(%opencl.pipe_t addrspace(1)* %{{.*}}, i64 addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_16(%opencl.pipe_t addrspace(1)* %{{.*}}, <2 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_32(%opencl.pipe_t addrspace(1)* %{{.*}}, <4 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_64(%opencl.pipe_t addrspace(1)* %{{.*}}, <8 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2_128(%opencl.pipe_t addrspace(1)* %{{.*}}, <16 x i64> addrspace(4)* %{{.*}}) #[[NOUNWIND]]
+; GCN-PRELINK: call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %{{.*}}, i8 addrspace(4)* %{{.*}}, i32 400, i32 4) #[[NOUNWIND]]
+define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
+entry:
+ %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8 addrspace(4)*
+ %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(4)* %tmp, i32 1, i32 1) #0
+ %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
+ %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8 addrspace(4)*
+ %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8 addrspace(4)* %tmp3, i32 2, i32 2) #0
+ %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
+ %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8 addrspace(4)*
+ %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8 addrspace(4)* %tmp6, i32 4, i32 4) #0
+ %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
+ %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8 addrspace(4)*
+ %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8 addrspace(4)* %tmp9, i32 8, i32 8) #0
+ %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
+ %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8 addrspace(4)*
+ %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8 addrspace(4)* %tmp12, i32 16, i32 16) #0
+ %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
+ %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8 addrspace(4)*
+ %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8 addrspace(4)* %tmp15, i32 32, i32 32) #0
+ %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
+ %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8 addrspace(4)*
+ %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8 addrspace(4)* %tmp18, i32 64, i32 64) #0
+ %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
+ %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8 addrspace(4)*
+ %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8 addrspace(4)* %tmp21, i32 128, i32 128) #0
+ %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
+ %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8 addrspace(4)*
+ %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8 addrspace(4)* %tmp24, i32 400, i32 4) #0 ; trailing operands differ (400 vs 4); the last check line expects this call to remain generic
+ ret void
+}
+
+; GCN-PRELINK: attributes #[[NOUNWIND]] = { nounwind }
+attributes #0 = { nounwind }