[Memcpy Loop Lowering] Remove the fixed int8 lowering.

Switch over to the lowering that uses target-supplied operand types.

Differential Revision: https://reviews.llvm.org/D41201

llvm-svn: 320989
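
For context: the retained lowering asks the target for the loop's operand
type (via TTI's getMemcpyLoopLoweringType hook) instead of hardcoding i8,
and emits a single load/store loop plus an optional residual block. A
minimal sketch of what the expansion looks like for a constant 1025-byte
copy when the supplied type works out to i8, as in the AMDGPU test below
(the function name is illustrative, not from the patch):

  define void @memcpy_expanded(i8* %dst, i8* %src) {
  entry:
    br label %load-store-loop

  load-store-loop:
    ; One element is copied per iteration; with an i8 element type
    ; the trip count equals the byte count.
    %loop-index = phi i64 [ 0, %entry ], [ %inc, %load-store-loop ]
    %src.gep = getelementptr inbounds i8, i8* %src, i64 %loop-index
    %val = load i8, i8* %src.gep
    %dst.gep = getelementptr inbounds i8, i8* %dst, i64 %loop-index
    store i8 %val, i8* %dst.gep
    %inc = add i64 %loop-index, 1
    %cond = icmp ult i64 %inc, 1025
    br i1 %cond, label %load-store-loop, label %memcpy-split

  memcpy-split:
    ; Residual loads/stores for a tail that is not a multiple of the
    ; element size would go here; with i8 there is no tail.
    ret void
  }
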
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index e1a2af6..7784672 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
 
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -18,21 +17,14 @@
 ; Smallest static size which will be expanded
 ; OPT-LABEL: @min_size_large_static_memcpy_caller0(
 ; OPT-NOT: call
-; OPT: getelementptr
-; OPT-NEXT: load i8
-; OPT: getelementptr
-; OPT-NEXT: store i8
-
-; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
-; WOPT-NOT: call
-; WOPT: br label %load-store-loop
-; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
-; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
-; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
-; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
-; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
-; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
-; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
+; OPT: br label %load-store-loop
+; OPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; OPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; OPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; OPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; OPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; OPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; OPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
 define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
   ret void
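
The NVPTX tests below exercise the runtime-size form of the same
expansion: since the trip count is not known at compile time, the loop is
guarded by a zero test on %n. A sketch of the shape the updated IR checks
expect, again assuming an i8 element type (function name illustrative):

  define i8* @memcpy_expanded(i8* %dst, i8* %src, i64 %n) {
  entry:
    ; Guard: skip the loop entirely when n == 0.
    %nonzero = icmp ne i64 %n, 0
    br i1 %nonzero, label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

  loop-memcpy-expansion:
    %loop-index = phi i64 [ 0, %entry ], [ %inc, %loop-memcpy-expansion ]
    %src.gep = getelementptr inbounds i8, i8* %src, i64 %loop-index
    %val = load i8, i8* %src.gep
    %dst.gep = getelementptr inbounds i8, i8* %dst, i64 %loop-index
    store i8 %val, i8* %dst.gep
    %inc = add i64 %loop-index, 1
    %cond = icmp ult i64 %inc, %n
    br i1 %cond, label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

  post-loop-memcpy-expansion:
    ret i8* %dst
  }

For a volatile memcpy the same loop is emitted with volatile loads and
stores, as the @memcpy_volatile_caller checks verify.
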
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index c11ced0..1da1af6 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,6 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
-; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
 ; llvm.mem* intrinsics get lowered to loops.
@@ -18,13 +17,22 @@
   ret i8* %dst
 
 ; IR-LABEL:   @memcpy_caller
-; IR:         [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n
-; IR:         br i1 [[CMPREG]], label %split, label %loadstoreloop
-; IR:         loadstoreloop:
-; IR:         [[LOADPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64
-; IR-NEXT:    [[VAL:%[0-9]+]] = load i8, i8* [[LOADPTR]]
-; IR-NEXT:    [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
-; IR-NEXT:    store i8 [[VAL]], i8* [[STOREPTR]]
+; IR:         entry:
+; IR:         [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
+; IR:         br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR:         loop-memcpy-expansion:
+; IR:         %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR:         [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR:         [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; IR:         [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR:         store i8 [[Load]], i8* [[DstGep]]
+; IR:         [[IndexInc]] = add i64 %loop-index, 1
+; IR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
+; IR:         br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR-LABEL:   post-loop-memcpy-expansion:
+; IR:         ret i8* %dst
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memcpy_caller
 ; PTX:        LBB[[LABEL:[_0-9]+]]:
@@ -34,23 +42,6 @@
 ; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX:        @%p[[PRED]] bra LBB[[LABEL]]
 
-; WIR-LABEL:   @memcpy_caller
-; WIR:         entry:
-; WIR:         [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; WIR:         br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR:         loop-memcpy-expansion:
-; WIR:         %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; WIR:         [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR:         [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
-; WIR:         [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR:         store i8 [[Load]], i8* [[DstGep]]
-; WIR:         [[IndexInc]] = add i64 %loop-index, 1
-; WIR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; WIR:         br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR-LABEL:   post-loop-memcpy-expansion:
-; WIR:         ret i8* %dst
 }
 
 define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -59,8 +50,23 @@
   ret i8* %dst
 
 ; IR-LABEL:   @memcpy_volatile_caller
-; IR:         load volatile
-; IR:         store volatile
+; IR:         entry:
+; IR:         [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
+; IR:         br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR:         loop-memcpy-expansion:
+; IR:         %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR:         [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR:         [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
+; IR:         [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR:         store volatile i8 [[Load]], i8* [[DstGep]]
+; IR:         [[IndexInc]] = add i64 %loop-index, 1
+; IR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
+; IR:         br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR-LABEL:   post-loop-memcpy-expansion:
+; IR:         ret i8* %dst
+
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
 ; PTX:        LBB[[LABEL:[_0-9]+]]:
@@ -69,24 +75,6 @@
 ; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX:        @%p[[PRED]] bra LBB[[LABEL]]
-
-; WIR-LABEL:   @memcpy_volatile_caller
-; WIR:         entry:
-; WIR:         [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; WIR:         br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR:         loop-memcpy-expansion:
-; WIR:         %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; WIR:         [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR:         [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
-; WIR:         [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR:         store volatile i8 [[Load]], i8* [[DstGep]]
-; WIR:         [[IndexInc]] = add i64 %loop-index, 1
-; WIR:         [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; WIR:         br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR-LABEL:   post-loop-memcpy-expansion:
-; WIR:         ret i8* %dst
 }
 
 define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -102,12 +90,6 @@
 ; IR:         [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
 ; IR:         getelementptr inbounds i8, i8* [[SRCCAST]]
 ; IR:         getelementptr inbounds i8, i8* [[DSTCAST]]
-
-; WIR-LABEL:   @memcpy_casting_caller
-; WIR:         [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
-; WIR:         [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
-; WIR:         getelementptr inbounds i8, i8* [[SRCCAST]]
-; WIR:         getelementptr inbounds i8, i8* [[DSTCAST]]
 }
 
 define i8* @memcpy_known_size(i8* %dst, i8* %src) {
@@ -116,18 +98,18 @@
   ret i8* %dst
 
 ; Check that calls with compile-time constant size are handled correctly
-; WIR-LABEL:    @memcpy_known_size
-; WIR:          entry:
-; WIR:          br label %load-store-loop
-; WIR:          load-store-loop:
-; WIR:          %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
-; WIR:          [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR:          [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
-; WIR:          [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR:          store i8 [[Load]], i8* [[DstGep]]
-; WIR:          [[IndexInc]] = add i64 %loop-index, 1
-; WIR:          [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
-; WIR:          br i1 [[Cond]], label %load-store-loop, label %memcpy-split
+; IR-LABEL:    @memcpy_known_size
+; IR:          entry:
+; IR:          br label %load-store-loop
+; IR:          load-store-loop:
+; IR:          %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; IR:          [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR:          [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; IR:          [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR:          store i8 [[Load]], i8* [[DstGep]]
+; IR:          [[IndexInc]] = add i64 %loop-index, 1
+; IR:          [[Cond:%[0-9]+]] = icmp ult i64 [[IndexInc]], 144
+; IR:          br i1 [[Cond]], label %load-store-loop, label %memcpy-split
 }
 
 define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {