[X86] Reduce Store Forward Block issues in HW - Recommit after fixing Bug 36346
If a load follows a store and reloads data that the store has written to memory, Intel microarchitectures can in many cases forward the data directly from the store to the load. This "store forwarding" saves cycles by enabling the load to obtain the data directly instead of accessing it from cache or memory.
A "store forward block" occurs in cases that a store cannot be forwarded to the load. The most typical case of store forward block on Intel Core microarchiticutre that a small store cannot be forwarded to a large load.
The estimated penalty for a store forward block is ~13 cycles.
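For illustration, this is the kind of sequence the tests below exercise (AT&T syntax, taken from the DISABLED output of test_imm_store; a 4-byte store followed by a 16-byte vector load from the same address):

    movl   $0, (%rdi)          # 4-byte store
    movups (%rdi), %xmm0       # 16-byte load covers the store - forwarding blocked
    movups %xmm0, (%rsi)

The 16-byte load overlaps the smaller store, so the data cannot be forwarded and the load stalls until the store data is available from memory.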
This pass tries to recognize and handle cases where a "store forward block" is created by the compiler when lowering memcpy calls to a sequence of a load and a store.
The pass currently only handles cases where the memcpy is lowered to XMM/YMM registers; it tries to break the memcpy into smaller copies. Breaking the memcpy up is legal since there is no atomicity guarantee for loads and stores to XMM/YMM registers.
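For the blocked sequence above, the pass instead emits copies sized so that each load is fully covered by the prior store, as in the test_imm_store checks below:

    movl   $0, (%rdi)
    movl   (%rdi), %eax        # 4-byte load - forwarded from the 4-byte store
    movl   %eax, (%rsi)
    movq   4(%rdi), %rax       # remaining bytes copied in wider chunks
    movq   %rax, 4(%rsi)
    movl   12(%rdi), %eax
    movl   %eax, 12(%rsi)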
Differential revision: https://reviews.llvm.org/D41330
Change-Id: Ib48836ccdf6005989f7d4466fa2035b7b04415d9
llvm-svn: 328973
diff --git a/llvm/test/CodeGen/X86/avoid-sfb.ll b/llvm/test/CodeGen/X86/avoid-sfb.ll
new file mode 100644
index 0000000..0e2d292
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avoid-sfb.ll
@@ -0,0 +1,1491 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-linux --x86-disable-avoid-SFB | FileCheck %s --check-prefix=DISABLED
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx | FileCheck %s -check-prefix=CHECK-AVX512
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @test_conditional_block(%struct.S* nocapture noalias %s1, %struct.S* nocapture noalias %s2, i32 %x, %struct.S* nocapture noalias %s3, %struct.S* nocapture noalias readonly %s4) local_unnamed_addr #0 {
+; CHECK-LABEL: test_conditional_block:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movl %edx, 4(%rdi)
+; CHECK-NEXT: .LBB0_2: # %if.end
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movl 4(%rdi), %eax
+; CHECK-NEXT: movl %eax, 4(%rsi)
+; CHECK-NEXT: movq 8(%rdi), %rax
+; CHECK-NEXT: movq %rax, 8(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_conditional_block:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB0_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movl %edx, 4(%rdi)
+; DISABLED-NEXT: .LBB0_2: # %if.end
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_conditional_block:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB0_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
+; CHECK-AVX2-NEXT: .LBB0_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX2-NEXT: movl (%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, (%rsi)
+; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_conditional_block:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB0_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
+; CHECK-AVX512-NEXT: .LBB0_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX512-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, (%rsi)
+; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
+ store i32 %x, i32* %b, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S* %s3 to i8*
+ %1 = bitcast %struct.S* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
+ %2 = bitcast %struct.S* %s2 to i8*
+ %3 = bitcast %struct.S* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @test_imm_store(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 {
+; CHECK-LABEL: test_imm_store:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movl $1, (%rcx)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movq 4(%rdi), %rax
+; CHECK-NEXT: movq %rax, 4(%rsi)
+; CHECK-NEXT: movl 12(%rdi), %eax
+; CHECK-NEXT: movl %eax, 12(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_imm_store:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: movl $0, (%rdi)
+; DISABLED-NEXT: movl $1, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_imm_store:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: movl $0, (%rdi)
+; CHECK-AVX2-NEXT: movl $1, (%rcx)
+; CHECK-AVX2-NEXT: movl (%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, (%rsi)
+; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 4(%rsi)
+; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_imm_store:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl $0, (%rdi)
+; CHECK-AVX512-NEXT: movl $1, (%rcx)
+; CHECK-AVX512-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, (%rsi)
+; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 4(%rsi)
+; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %a = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 0
+ store i32 0, i32* %a, align 4
+ %a1 = getelementptr inbounds %struct.S, %struct.S* %s3, i64 0, i32 0
+ store i32 1, i32* %a1, align 4
+ %0 = bitcast %struct.S* %s2 to i8*
+ %1 = bitcast %struct.S* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @test_nondirect_br(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
+; CHECK-LABEL: test_nondirect_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB2_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movl %edx, 4(%rdi)
+; CHECK-NEXT: .LBB2_2: # %if.end
+; CHECK-NEXT: cmpl $14, %r9d
+; CHECK-NEXT: jl .LBB2_4
+; CHECK-NEXT: # %bb.3: # %if.then2
+; CHECK-NEXT: movl %r9d, 12(%rdi)
+; CHECK-NEXT: .LBB2_4: # %if.end3
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: movl 8(%rdi), %eax
+; CHECK-NEXT: movl %eax, 8(%rsi)
+; CHECK-NEXT: movl 12(%rdi), %eax
+; CHECK-NEXT: movl %eax, 12(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_nondirect_br:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB2_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movl %edx, 4(%rdi)
+; DISABLED-NEXT: .LBB2_2: # %if.end
+; DISABLED-NEXT: cmpl $14, %r9d
+; DISABLED-NEXT: jl .LBB2_4
+; DISABLED-NEXT: # %bb.3: # %if.then2
+; DISABLED-NEXT: movl %r9d, 12(%rdi)
+; DISABLED-NEXT: .LBB2_4: # %if.end3
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_nondirect_br:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB2_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
+; CHECK-AVX2-NEXT: .LBB2_2: # %if.end
+; CHECK-AVX2-NEXT: cmpl $14, %r9d
+; CHECK-AVX2-NEXT: jl .LBB2_4
+; CHECK-AVX2-NEXT: # %bb.3: # %if.then2
+; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX2-NEXT: .LBB2_4: # %if.end3
+; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX2-NEXT: movq (%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, (%rsi)
+; CHECK-AVX2-NEXT: movl 8(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 8(%rsi)
+; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_nondirect_br:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB2_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
+; CHECK-AVX512-NEXT: .LBB2_2: # %if.end
+; CHECK-AVX512-NEXT: cmpl $14, %r9d
+; CHECK-AVX512-NEXT: jl .LBB2_4
+; CHECK-AVX512-NEXT: # %bb.3: # %if.then2
+; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX512-NEXT: .LBB2_4: # %if.end3
+; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX512-NEXT: movq (%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, (%rsi)
+; CHECK-AVX512-NEXT: movl 8(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 8(%rsi)
+; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
+ store i32 %x, i32* %b, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %cmp1 = icmp sgt i32 %x2, 13
+ br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2: ; preds = %if.end
+ %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
+ store i32 %x2, i32* %d, align 4
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %if.end
+ %0 = bitcast %struct.S* %s3 to i8*
+ %1 = bitcast %struct.S* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
+ %2 = bitcast %struct.S* %s2 to i8*
+ %3 = bitcast %struct.S* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @test_2preds_block(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
+; CHECK-LABEL: test_2preds_block:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %r9d, 12(%rdi)
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB3_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movl %edx, 4(%rdi)
+; CHECK-NEXT: .LBB3_2: # %if.end
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movl 4(%rdi), %eax
+; CHECK-NEXT: movl %eax, 4(%rsi)
+; CHECK-NEXT: movl 8(%rdi), %eax
+; CHECK-NEXT: movl %eax, 8(%rsi)
+; CHECK-NEXT: movl 12(%rdi), %eax
+; CHECK-NEXT: movl %eax, 12(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_2preds_block:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: movl %r9d, 12(%rdi)
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB3_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movl %edx, 4(%rdi)
+; DISABLED-NEXT: .LBB3_2: # %if.end
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_2preds_block:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB3_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movl %edx, 4(%rdi)
+; CHECK-AVX2-NEXT: .LBB3_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX2-NEXT: movl (%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, (%rsi)
+; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX2-NEXT: movl 8(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 8(%rsi)
+; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_2preds_block:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB3_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movl %edx, 4(%rdi)
+; CHECK-AVX512-NEXT: .LBB3_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX512-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, (%rsi)
+; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX512-NEXT: movl 8(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 8(%rsi)
+; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
+ store i32 %x2, i32* %d, align 4
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
+ store i32 %x, i32* %b, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S* %s3 to i8*
+ %1 = bitcast %struct.S* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
+ %2 = bitcast %struct.S* %s2 to i8*
+ %3 = bitcast %struct.S* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
+ ret void
+}
+%struct.S2 = type { i64, i64 }
+
+; Function Attrs: nounwind uwtable
+define void @test_type64(%struct.S2* nocapture noalias %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 {
+; CHECK-LABEL: test_type64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB4_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movslq %edx, %rax
+; CHECK-NEXT: movq %rax, 8(%rdi)
+; CHECK-NEXT: .LBB4_2: # %if.end
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: movq 8(%rdi), %rax
+; CHECK-NEXT: movq %rax, 8(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_type64:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB4_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movslq %edx, %rax
+; DISABLED-NEXT: movq %rax, 8(%rdi)
+; DISABLED-NEXT: .LBB4_2: # %if.end
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_type64:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB4_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movslq %edx, %rax
+; CHECK-AVX2-NEXT: movq %rax, 8(%rdi)
+; CHECK-AVX2-NEXT: .LBB4_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX2-NEXT: movq (%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, (%rsi)
+; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_type64:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB4_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movslq %edx, %rax
+; CHECK-AVX512-NEXT: movq %rax, 8(%rdi)
+; CHECK-AVX512-NEXT: .LBB4_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX512-NEXT: movq (%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, (%rsi)
+; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %conv = sext i32 %x to i64
+ %b = getelementptr inbounds %struct.S2, %struct.S2* %s1, i64 0, i32 1
+ store i64 %conv, i64* %b, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S2* %s3 to i8*
+ %1 = bitcast %struct.S2* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
+ %2 = bitcast %struct.S2* %s2 to i8*
+ %3 = bitcast %struct.S2* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 8, i1 false)
+ ret void
+}
+%struct.S3 = type { i64, i8, i8, i16, i32 }
+
+; Function Attrs: noinline nounwind uwtable
+define void @test_mixed_type(%struct.S3* nocapture noalias %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mixed_type:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB5_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movslq %edx, %rax
+; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: movb %dl, 8(%rdi)
+; CHECK-NEXT: .LBB5_2: # %if.end
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: movb 8(%rdi), %al
+; CHECK-NEXT: movb %al, 8(%rsi)
+; CHECK-NEXT: movl 9(%rdi), %eax
+; CHECK-NEXT: movl %eax, 9(%rsi)
+; CHECK-NEXT: movzwl 13(%rdi), %eax
+; CHECK-NEXT: movw %ax, 13(%rsi)
+; CHECK-NEXT: movb 15(%rdi), %al
+; CHECK-NEXT: movb %al, 15(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_mixed_type:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB5_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movslq %edx, %rax
+; DISABLED-NEXT: movq %rax, (%rdi)
+; DISABLED-NEXT: movb %dl, 8(%rdi)
+; DISABLED-NEXT: .LBB5_2: # %if.end
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_mixed_type:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB5_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movslq %edx, %rax
+; CHECK-AVX2-NEXT: movq %rax, (%rdi)
+; CHECK-AVX2-NEXT: movb %dl, 8(%rdi)
+; CHECK-AVX2-NEXT: .LBB5_2: # %if.end
+; CHECK-AVX2-NEXT: movq (%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, (%rsi)
+; CHECK-AVX2-NEXT: movb 8(%rdi), %al
+; CHECK-AVX2-NEXT: movb %al, 8(%rsi)
+; CHECK-AVX2-NEXT: movl 9(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 9(%rsi)
+; CHECK-AVX2-NEXT: movzwl 13(%rdi), %eax
+; CHECK-AVX2-NEXT: movw %ax, 13(%rsi)
+; CHECK-AVX2-NEXT: movb 15(%rdi), %al
+; CHECK-AVX2-NEXT: movb %al, 15(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_mixed_type:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB5_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movslq %edx, %rax
+; CHECK-AVX512-NEXT: movq %rax, (%rdi)
+; CHECK-AVX512-NEXT: movb %dl, 8(%rdi)
+; CHECK-AVX512-NEXT: .LBB5_2: # %if.end
+; CHECK-AVX512-NEXT: movq (%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, (%rsi)
+; CHECK-AVX512-NEXT: movb 8(%rdi), %al
+; CHECK-AVX512-NEXT: movb %al, 8(%rsi)
+; CHECK-AVX512-NEXT: movl 9(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 9(%rsi)
+; CHECK-AVX512-NEXT: movzwl 13(%rdi), %eax
+; CHECK-AVX512-NEXT: movw %ax, 13(%rsi)
+; CHECK-AVX512-NEXT: movb 15(%rdi), %al
+; CHECK-AVX512-NEXT: movb %al, 15(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %conv = sext i32 %x to i64
+ %a = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 0
+ store i64 %conv, i64* %a, align 8
+ %conv1 = trunc i32 %x to i8
+ %b = getelementptr inbounds %struct.S3, %struct.S3* %s1, i64 0, i32 1
+ store i8 %conv1, i8* %b, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S3* %s2 to i8*
+ %1 = bitcast %struct.S3* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 8, i1 false)
+ ret void
+}
+%struct.S4 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @test_multiple_blocks(%struct.S4* nocapture noalias %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 {
+; CHECK-LABEL: test_multiple_blocks:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $0, 4(%rdi)
+; CHECK-NEXT: movl $0, 36(%rdi)
+; CHECK-NEXT: movups 16(%rdi), %xmm0
+; CHECK-NEXT: movups %xmm0, 16(%rsi)
+; CHECK-NEXT: movl 32(%rdi), %eax
+; CHECK-NEXT: movl %eax, 32(%rsi)
+; CHECK-NEXT: movl 36(%rdi), %eax
+; CHECK-NEXT: movl %eax, 36(%rsi)
+; CHECK-NEXT: movq 40(%rdi), %rax
+; CHECK-NEXT: movq %rax, 40(%rsi)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movl 4(%rdi), %eax
+; CHECK-NEXT: movl %eax, 4(%rsi)
+; CHECK-NEXT: movq 8(%rdi), %rax
+; CHECK-NEXT: movq %rax, 8(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_multiple_blocks:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: movl $0, 4(%rdi)
+; DISABLED-NEXT: movl $0, 36(%rdi)
+; DISABLED-NEXT: movups 16(%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, 16(%rsi)
+; DISABLED-NEXT: movups 32(%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, 32(%rsi)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_multiple_blocks:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: movl $0, 4(%rdi)
+; CHECK-AVX2-NEXT: movl $0, 36(%rdi)
+; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi)
+; CHECK-AVX2-NEXT: movl 32(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 32(%rsi)
+; CHECK-AVX2-NEXT: movl 36(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 36(%rsi)
+; CHECK-AVX2-NEXT: movq 40(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 40(%rsi)
+; CHECK-AVX2-NEXT: movl (%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, (%rsi)
+; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi)
+; CHECK-AVX2-NEXT: movq 24(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 24(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_multiple_blocks:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl $0, 4(%rdi)
+; CHECK-AVX512-NEXT: movl $0, 36(%rdi)
+; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi)
+; CHECK-AVX512-NEXT: movl 32(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 32(%rsi)
+; CHECK-AVX512-NEXT: movl 36(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 36(%rsi)
+; CHECK-AVX512-NEXT: movq 40(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 40(%rsi)
+; CHECK-AVX512-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, (%rsi)
+; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi)
+; CHECK-AVX512-NEXT: movq 24(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 24(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %b = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 1
+ store i32 0, i32* %b, align 4
+ %b3 = getelementptr inbounds %struct.S4, %struct.S4* %s1, i64 0, i32 9
+ store i32 0, i32* %b3, align 4
+ %0 = bitcast %struct.S4* %s2 to i8*
+ %1 = bitcast %struct.S4* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 48, i32 4, i1 false)
+ ret void
+}
+%struct.S5 = type { i16, i16, i16, i16, i16, i16, i16, i16 }
+
+; Function Attrs: nounwind uwtable
+define void @test_type16(%struct.S5* nocapture noalias %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 {
+; CHECK-LABEL: test_type16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB7_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movw %dx, 2(%rdi)
+; CHECK-NEXT: .LBB7_2: # %if.end
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: movw %ax, (%rsi)
+; CHECK-NEXT: movzwl 2(%rdi), %eax
+; CHECK-NEXT: movw %ax, 2(%rsi)
+; CHECK-NEXT: movq 4(%rdi), %rax
+; CHECK-NEXT: movq %rax, 4(%rsi)
+; CHECK-NEXT: movl 12(%rdi), %eax
+; CHECK-NEXT: movl %eax, 12(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_type16:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB7_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movw %dx, 2(%rdi)
+; DISABLED-NEXT: .LBB7_2: # %if.end
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_type16:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB7_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movw %dx, 2(%rdi)
+; CHECK-AVX2-NEXT: .LBB7_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX2-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX2-NEXT: movw %ax, (%rsi)
+; CHECK-AVX2-NEXT: movzwl 2(%rdi), %eax
+; CHECK-AVX2-NEXT: movw %ax, 2(%rsi)
+; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 4(%rsi)
+; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_type16:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB7_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movw %dx, 2(%rdi)
+; CHECK-AVX512-NEXT: .LBB7_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx)
+; CHECK-AVX512-NEXT: movzwl (%rdi), %eax
+; CHECK-AVX512-NEXT: movw %ax, (%rsi)
+; CHECK-AVX512-NEXT: movzwl 2(%rdi), %eax
+; CHECK-AVX512-NEXT: movw %ax, 2(%rsi)
+; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 4(%rsi)
+; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 12(%rsi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %conv = trunc i32 %x to i16
+ %b = getelementptr inbounds %struct.S5, %struct.S5* %s1, i64 0, i32 1
+ store i16 %conv, i16* %b, align 2
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S5* %s3 to i8*
+ %1 = bitcast %struct.S5* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 2, i1 false)
+ %2 = bitcast %struct.S5* %s2 to i8*
+ %3 = bitcast %struct.S5* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 2, i1 false)
+ ret void
+}
+
+%struct.S6 = type { [4 x i32], i32, i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 {
+; CHECK-LABEL: test_stack:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: movups %xmm0, (%rdi)
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movq %rax, 16(%rdi)
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, 24(%rdi)
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, 28(%rdi)
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_stack:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: movl %esi, {{[0-9]+}}(%rsp)
+; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%rdi)
+; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; DISABLED-NEXT: movups %xmm0, 16(%rdi)
+; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; DISABLED-NEXT: movq %rdi, %rax
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_stack:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: movl %esi, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-AVX2-NEXT: movq %rax, 16(%rdi)
+; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX2-NEXT: movl %eax, 24(%rdi)
+; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX2-NEXT: movl %eax, 28(%rdi)
+; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-AVX2-NEXT: movq %rdi, %rax
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_stack:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl %esi, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-AVX512-NEXT: movq %rax, 16(%rdi)
+; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX512-NEXT: movl %eax, 24(%rdi)
+; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX512-NEXT: movl %eax, 28(%rdi)
+; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK-AVX512-NEXT: movq %rdi, %rax
+; CHECK-AVX512-NEXT: retq
+entry:
+ %s6.sroa.0.0..sroa_cast1 = bitcast %struct.S6* %s2 to i8*
+ %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, %struct.S6* %s2, i64 0, i32 3
+ store i32 %x, i32* %s6.sroa.3.0..sroa_idx4, align 8
+ %0 = bitcast %struct.S6* %agg.result to i8*
+ %s6.sroa.0.0..sroa_cast2 = bitcast %struct.S6* %s1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %s6.sroa.0.0..sroa_cast2, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false)
+
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @test_limit_all(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
+; CHECK-LABEL: test_limit_all:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset %rbx, -48
+; CHECK-NEXT: .cfi_offset %r12, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %r8, %r15
+; CHECK-NEXT: movq %rcx, %r14
+; CHECK-NEXT: movl %edx, %ebp
+; CHECK-NEXT: movq %rsi, %r12
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movl %r9d, 12(%rdi)
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: cmpl $18, %ebp
+; CHECK-NEXT: jl .LBB9_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movl %ebp, 4(%rbx)
+; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: .LBB9_2: # %if.end
+; CHECK-NEXT: movups (%r15), %xmm0
+; CHECK-NEXT: movups %xmm0, (%r14)
+; CHECK-NEXT: movups (%rbx), %xmm0
+; CHECK-NEXT: movups %xmm0, (%r12)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_limit_all:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: pushq %rbp
+; DISABLED-NEXT: .cfi_def_cfa_offset 16
+; DISABLED-NEXT: pushq %r15
+; DISABLED-NEXT: .cfi_def_cfa_offset 24
+; DISABLED-NEXT: pushq %r14
+; DISABLED-NEXT: .cfi_def_cfa_offset 32
+; DISABLED-NEXT: pushq %r12
+; DISABLED-NEXT: .cfi_def_cfa_offset 40
+; DISABLED-NEXT: pushq %rbx
+; DISABLED-NEXT: .cfi_def_cfa_offset 48
+; DISABLED-NEXT: .cfi_offset %rbx, -48
+; DISABLED-NEXT: .cfi_offset %r12, -40
+; DISABLED-NEXT: .cfi_offset %r14, -32
+; DISABLED-NEXT: .cfi_offset %r15, -24
+; DISABLED-NEXT: .cfi_offset %rbp, -16
+; DISABLED-NEXT: movq %r8, %r15
+; DISABLED-NEXT: movq %rcx, %r14
+; DISABLED-NEXT: movl %edx, %ebp
+; DISABLED-NEXT: movq %rsi, %r12
+; DISABLED-NEXT: movq %rdi, %rbx
+; DISABLED-NEXT: movl %r9d, 12(%rdi)
+; DISABLED-NEXT: callq bar
+; DISABLED-NEXT: cmpl $18, %ebp
+; DISABLED-NEXT: jl .LBB9_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movl %ebp, 4(%rbx)
+; DISABLED-NEXT: movq %rbx, %rdi
+; DISABLED-NEXT: callq bar
+; DISABLED-NEXT: .LBB9_2: # %if.end
+; DISABLED-NEXT: movups (%r15), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%r14)
+; DISABLED-NEXT: movups (%rbx), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%r12)
+; DISABLED-NEXT: popq %rbx
+; DISABLED-NEXT: popq %r12
+; DISABLED-NEXT: popq %r14
+; DISABLED-NEXT: popq %r15
+; DISABLED-NEXT: popq %rbp
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_limit_all:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: pushq %rbp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
+; CHECK-AVX2-NEXT: pushq %r15
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24
+; CHECK-AVX2-NEXT: pushq %r14
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
+; CHECK-AVX2-NEXT: pushq %r12
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40
+; CHECK-AVX2-NEXT: pushq %rbx
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT: .cfi_offset %rbx, -48
+; CHECK-AVX2-NEXT: .cfi_offset %r12, -40
+; CHECK-AVX2-NEXT: .cfi_offset %r14, -32
+; CHECK-AVX2-NEXT: .cfi_offset %r15, -24
+; CHECK-AVX2-NEXT: .cfi_offset %rbp, -16
+; CHECK-AVX2-NEXT: movq %r8, %r15
+; CHECK-AVX2-NEXT: movq %rcx, %r14
+; CHECK-AVX2-NEXT: movl %edx, %ebp
+; CHECK-AVX2-NEXT: movq %rsi, %r12
+; CHECK-AVX2-NEXT: movq %rdi, %rbx
+; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX2-NEXT: callq bar
+; CHECK-AVX2-NEXT: cmpl $18, %ebp
+; CHECK-AVX2-NEXT: jl .LBB9_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movl %ebp, 4(%rbx)
+; CHECK-AVX2-NEXT: movq %rbx, %rdi
+; CHECK-AVX2-NEXT: callq bar
+; CHECK-AVX2-NEXT: .LBB9_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r15), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%r14)
+; CHECK-AVX2-NEXT: vmovups (%rbx), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%r12)
+; CHECK-AVX2-NEXT: popq %rbx
+; CHECK-AVX2-NEXT: popq %r12
+; CHECK-AVX2-NEXT: popq %r14
+; CHECK-AVX2-NEXT: popq %r15
+; CHECK-AVX2-NEXT: popq %rbp
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_limit_all:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: pushq %rbp
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
+; CHECK-AVX512-NEXT: pushq %r15
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24
+; CHECK-AVX512-NEXT: pushq %r14
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
+; CHECK-AVX512-NEXT: pushq %r12
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40
+; CHECK-AVX512-NEXT: pushq %rbx
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX512-NEXT: .cfi_offset %rbx, -48
+; CHECK-AVX512-NEXT: .cfi_offset %r12, -40
+; CHECK-AVX512-NEXT: .cfi_offset %r14, -32
+; CHECK-AVX512-NEXT: .cfi_offset %r15, -24
+; CHECK-AVX512-NEXT: .cfi_offset %rbp, -16
+; CHECK-AVX512-NEXT: movq %r8, %r15
+; CHECK-AVX512-NEXT: movq %rcx, %r14
+; CHECK-AVX512-NEXT: movl %edx, %ebp
+; CHECK-AVX512-NEXT: movq %rsi, %r12
+; CHECK-AVX512-NEXT: movq %rdi, %rbx
+; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX512-NEXT: callq bar
+; CHECK-AVX512-NEXT: cmpl $18, %ebp
+; CHECK-AVX512-NEXT: jl .LBB9_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movl %ebp, 4(%rbx)
+; CHECK-AVX512-NEXT: movq %rbx, %rdi
+; CHECK-AVX512-NEXT: callq bar
+; CHECK-AVX512-NEXT: .LBB9_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r15), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%r14)
+; CHECK-AVX512-NEXT: vmovups (%rbx), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%r12)
+; CHECK-AVX512-NEXT: popq %rbx
+; CHECK-AVX512-NEXT: popq %r12
+; CHECK-AVX512-NEXT: popq %r14
+; CHECK-AVX512-NEXT: popq %r15
+; CHECK-AVX512-NEXT: popq %rbp
+; CHECK-AVX512-NEXT: retq
+entry:
+ %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
+ store i32 %x2, i32* %d, align 4
+ tail call void @bar(%struct.S* %s1) #3
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
+ store i32 %x, i32* %b, align 4
+ tail call void @bar(%struct.S* nonnull %s1) #3
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S* %s3 to i8*
+ %1 = bitcast %struct.S* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
+ %2 = bitcast %struct.S* %s2 to i8*
+ %3 = bitcast %struct.S* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define void @test_limit_one_pred(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 {
+; CHECK-LABEL: test_limit_one_pred:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset %rbx, -40
+; CHECK-NEXT: .cfi_offset %r12, -32
+; CHECK-NEXT: .cfi_offset %r14, -24
+; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: movq %r8, %r12
+; CHECK-NEXT: movq %rcx, %r15
+; CHECK-NEXT: movq %rsi, %r14
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movl %r9d, 12(%rdi)
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB10_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movl %edx, 4(%rbx)
+; CHECK-NEXT: movq %rbx, %rdi
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: .LBB10_2: # %if.end
+; CHECK-NEXT: movups (%r12), %xmm0
+; CHECK-NEXT: movups %xmm0, (%r15)
+; CHECK-NEXT: movq (%rbx), %rax
+; CHECK-NEXT: movq %rax, (%r14)
+; CHECK-NEXT: movl 8(%rbx), %eax
+; CHECK-NEXT: movl %eax, 8(%r14)
+; CHECK-NEXT: movl 12(%rbx), %eax
+; CHECK-NEXT: movl %eax, 12(%r14)
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_limit_one_pred:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: pushq %r15
+; DISABLED-NEXT: .cfi_def_cfa_offset 16
+; DISABLED-NEXT: pushq %r14
+; DISABLED-NEXT: .cfi_def_cfa_offset 24
+; DISABLED-NEXT: pushq %r12
+; DISABLED-NEXT: .cfi_def_cfa_offset 32
+; DISABLED-NEXT: pushq %rbx
+; DISABLED-NEXT: .cfi_def_cfa_offset 40
+; DISABLED-NEXT: pushq %rax
+; DISABLED-NEXT: .cfi_def_cfa_offset 48
+; DISABLED-NEXT: .cfi_offset %rbx, -40
+; DISABLED-NEXT: .cfi_offset %r12, -32
+; DISABLED-NEXT: .cfi_offset %r14, -24
+; DISABLED-NEXT: .cfi_offset %r15, -16
+; DISABLED-NEXT: movq %r8, %r15
+; DISABLED-NEXT: movq %rcx, %r14
+; DISABLED-NEXT: movq %rsi, %r12
+; DISABLED-NEXT: movq %rdi, %rbx
+; DISABLED-NEXT: movl %r9d, 12(%rdi)
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB10_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movl %edx, 4(%rbx)
+; DISABLED-NEXT: movq %rbx, %rdi
+; DISABLED-NEXT: callq bar
+; DISABLED-NEXT: .LBB10_2: # %if.end
+; DISABLED-NEXT: movups (%r15), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%r14)
+; DISABLED-NEXT: movups (%rbx), %xmm0
+; DISABLED-NEXT: movups %xmm0, (%r12)
+; DISABLED-NEXT: addq $8, %rsp
+; DISABLED-NEXT: popq %rbx
+; DISABLED-NEXT: popq %r12
+; DISABLED-NEXT: popq %r14
+; DISABLED-NEXT: popq %r15
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_limit_one_pred:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: pushq %r15
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16
+; CHECK-AVX2-NEXT: pushq %r14
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24
+; CHECK-AVX2-NEXT: pushq %r12
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32
+; CHECK-AVX2-NEXT: pushq %rbx
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40
+; CHECK-AVX2-NEXT: pushq %rax
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT: .cfi_offset %rbx, -40
+; CHECK-AVX2-NEXT: .cfi_offset %r12, -32
+; CHECK-AVX2-NEXT: .cfi_offset %r14, -24
+; CHECK-AVX2-NEXT: .cfi_offset %r15, -16
+; CHECK-AVX2-NEXT: movq %r8, %r12
+; CHECK-AVX2-NEXT: movq %rcx, %r15
+; CHECK-AVX2-NEXT: movq %rsi, %r14
+; CHECK-AVX2-NEXT: movq %rdi, %rbx
+; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB10_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movl %edx, 4(%rbx)
+; CHECK-AVX2-NEXT: movq %rbx, %rdi
+; CHECK-AVX2-NEXT: callq bar
+; CHECK-AVX2-NEXT: .LBB10_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, (%r15)
+; CHECK-AVX2-NEXT: movq (%rbx), %rax
+; CHECK-AVX2-NEXT: movq %rax, (%r14)
+; CHECK-AVX2-NEXT: movl 8(%rbx), %eax
+; CHECK-AVX2-NEXT: movl %eax, 8(%r14)
+; CHECK-AVX2-NEXT: movl 12(%rbx), %eax
+; CHECK-AVX2-NEXT: movl %eax, 12(%r14)
+; CHECK-AVX2-NEXT: addq $8, %rsp
+; CHECK-AVX2-NEXT: popq %rbx
+; CHECK-AVX2-NEXT: popq %r12
+; CHECK-AVX2-NEXT: popq %r14
+; CHECK-AVX2-NEXT: popq %r15
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_limit_one_pred:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: pushq %r15
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16
+; CHECK-AVX512-NEXT: pushq %r14
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24
+; CHECK-AVX512-NEXT: pushq %r12
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32
+; CHECK-AVX512-NEXT: pushq %rbx
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40
+; CHECK-AVX512-NEXT: pushq %rax
+; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX512-NEXT: .cfi_offset %rbx, -40
+; CHECK-AVX512-NEXT: .cfi_offset %r12, -32
+; CHECK-AVX512-NEXT: .cfi_offset %r14, -24
+; CHECK-AVX512-NEXT: .cfi_offset %r15, -16
+; CHECK-AVX512-NEXT: movq %r8, %r12
+; CHECK-AVX512-NEXT: movq %rcx, %r15
+; CHECK-AVX512-NEXT: movq %rsi, %r14
+; CHECK-AVX512-NEXT: movq %rdi, %rbx
+; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi)
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB10_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movl %edx, 4(%rbx)
+; CHECK-AVX512-NEXT: movq %rbx, %rdi
+; CHECK-AVX512-NEXT: callq bar
+; CHECK-AVX512-NEXT: .LBB10_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, (%r15)
+; CHECK-AVX512-NEXT: movq (%rbx), %rax
+; CHECK-AVX512-NEXT: movq %rax, (%r14)
+; CHECK-AVX512-NEXT: movl 8(%rbx), %eax
+; CHECK-AVX512-NEXT: movl %eax, 8(%r14)
+; CHECK-AVX512-NEXT: movl 12(%rbx), %eax
+; CHECK-AVX512-NEXT: movl %eax, 12(%r14)
+; CHECK-AVX512-NEXT: addq $8, %rsp
+; CHECK-AVX512-NEXT: popq %rbx
+; CHECK-AVX512-NEXT: popq %r12
+; CHECK-AVX512-NEXT: popq %r14
+; CHECK-AVX512-NEXT: popq %r15
+; CHECK-AVX512-NEXT: retq
+entry:
+ %d = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 3
+ store i32 %x2, i32* %d, align 4
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S, %struct.S* %s1, i64 0, i32 1
+ store i32 %x, i32* %b, align 4
+ tail call void @bar(%struct.S* nonnull %s1) #3
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S* %s3 to i8*
+ %1 = bitcast %struct.S* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 16, i32 4, i1 false)
+ %2 = bitcast %struct.S* %s2 to i8*
+ %3 = bitcast %struct.S* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 16, i32 4, i1 false)
+ ret void
+}
+
+
+declare void @bar(%struct.S*) local_unnamed_addr #1
+
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind uwtable "target-cpu"="x86-64" }
+
+%struct.S7 = type { float, float, float, float, float, float, float, float }
+
+; Function Attrs: nounwind uwtable
+define void @test_conditional_block_float(%struct.S7* nocapture noalias %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 {
+; CHECK-LABEL: test_conditional_block_float:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB11_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
+; CHECK-NEXT: .LBB11_2: # %if.end
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups 16(%r8), %xmm1
+; CHECK-NEXT: movups %xmm1, 16(%rcx)
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: movl 4(%rdi), %ecx
+; CHECK-NEXT: movq 8(%rdi), %rdx
+; CHECK-NEXT: movups 16(%rdi), %xmm0
+; CHECK-NEXT: movups %xmm0, 16(%rsi)
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movl %ecx, 4(%rsi)
+; CHECK-NEXT: movq %rdx, 8(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_conditional_block_float:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB11_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
+; DISABLED-NEXT: .LBB11_2: # %if.end
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups 16(%r8), %xmm1
+; DISABLED-NEXT: movups %xmm1, 16(%rcx)
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups 16(%rdi), %xmm1
+; DISABLED-NEXT: movups %xmm1, 16(%rsi)
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_conditional_block_float:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB11_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
+; CHECK-AVX2-NEXT: .LBB11_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0
+; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx)
+; CHECK-AVX2-NEXT: movl (%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, (%rsi)
+; CHECK-AVX2-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi)
+; CHECK-AVX2-NEXT: movq 24(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 24(%rsi)
+; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_conditional_block_float:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB11_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000
+; CHECK-AVX512-NEXT: .LBB11_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0
+; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx)
+; CHECK-AVX512-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, (%rsi)
+; CHECK-AVX512-NEXT: movl 4(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 4(%rsi)
+; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi)
+; CHECK-AVX512-NEXT: movq 24(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 24(%rsi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S7, %struct.S7* %s1, i64 0, i32 1
+ store float 1.0, float* %b, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S7* %s3 to i8*
+ %1 = bitcast %struct.S7* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
+ %2 = bitcast %struct.S7* %s2 to i8*
+ %3 = bitcast %struct.S7* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
+ ret void
+}
+
+%struct.S8 = type { i64, i64, i64, i64, i64, i64 }
+
+; Function Attrs: nounwind uwtable
+define void @test_conditional_block_ymm(%struct.S8* nocapture noalias %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 {
+; CHECK-LABEL: test_conditional_block_ymm:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $18, %edx
+; CHECK-NEXT: jl .LBB12_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movq $1, 8(%rdi)
+; CHECK-NEXT: .LBB12_2: # %if.end
+; CHECK-NEXT: movups (%r8), %xmm0
+; CHECK-NEXT: movups 16(%r8), %xmm1
+; CHECK-NEXT: movups %xmm1, 16(%rcx)
+; CHECK-NEXT: movups %xmm0, (%rcx)
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: movq 8(%rdi), %rcx
+; CHECK-NEXT: movups 16(%rdi), %xmm0
+; CHECK-NEXT: movups %xmm0, 16(%rsi)
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: movq %rcx, 8(%rsi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_conditional_block_ymm:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: cmpl $18, %edx
+; DISABLED-NEXT: jl .LBB12_2
+; DISABLED-NEXT: # %bb.1: # %if.then
+; DISABLED-NEXT: movq $1, 8(%rdi)
+; DISABLED-NEXT: .LBB12_2: # %if.end
+; DISABLED-NEXT: movups (%r8), %xmm0
+; DISABLED-NEXT: movups 16(%r8), %xmm1
+; DISABLED-NEXT: movups %xmm1, 16(%rcx)
+; DISABLED-NEXT: movups %xmm0, (%rcx)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups 16(%rdi), %xmm1
+; DISABLED-NEXT: movups %xmm1, 16(%rsi)
+; DISABLED-NEXT: movups %xmm0, (%rsi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_conditional_block_ymm:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: cmpl $18, %edx
+; CHECK-AVX2-NEXT: jl .LBB12_2
+; CHECK-AVX2-NEXT: # %bb.1: # %if.then
+; CHECK-AVX2-NEXT: movq $1, 8(%rdi)
+; CHECK-AVX2-NEXT: .LBB12_2: # %if.end
+; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0
+; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx)
+; CHECK-AVX2-NEXT: movq (%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, (%rsi)
+; CHECK-AVX2-NEXT: movq 8(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 8(%rsi)
+; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi)
+; CHECK-AVX2-NEXT: vzeroupper
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_conditional_block_ymm:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: cmpl $18, %edx
+; CHECK-AVX512-NEXT: jl .LBB12_2
+; CHECK-AVX512-NEXT: # %bb.1: # %if.then
+; CHECK-AVX512-NEXT: movq $1, 8(%rdi)
+; CHECK-AVX512-NEXT: .LBB12_2: # %if.end
+; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0
+; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx)
+; CHECK-AVX512-NEXT: movq (%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, (%rsi)
+; CHECK-AVX512-NEXT: movq 8(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 8(%rsi)
+; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %x, 17
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %b = getelementptr inbounds %struct.S8, %struct.S8* %s1, i64 0, i32 1
+ store i64 1, i64* %b, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %0 = bitcast %struct.S8* %s3 to i8*
+ %1 = bitcast %struct.S8* %s4 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 32, i32 4, i1 false)
+ %2 = bitcast %struct.S8* %s2 to i8*
+ %3 = bitcast %struct.S8* %s1 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false)
+ ret void
+}
+
+define dso_local void @test_alias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
+; CHECK-LABEL: test_alias:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, (%rdi)
+; CHECK-NEXT: movups (%rdi), %xmm0
+; CHECK-NEXT: movups %xmm0, 4(%rdi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_alias:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: movl %esi, (%rdi)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, 4(%rdi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_alias:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: movl %esi, (%rdi)
+; CHECK-AVX2-NEXT: vmovups (%rdi), %xmm0
+; CHECK-AVX2-NEXT: vmovups %xmm0, 4(%rdi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_alias:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl %esi, (%rdi)
+; CHECK-AVX512-NEXT: vmovups (%rdi), %xmm0
+; CHECK-AVX512-NEXT: vmovups %xmm0, 4(%rdi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %a = bitcast i8* %A to i32*
+ store i32 %x, i32* %a, align 4
+ %add.ptr = getelementptr inbounds i8, i8* %A, i64 4
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind uwtable
+define dso_local void @test_noalias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 {
+; CHECK-LABEL: test_noalias:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %esi, (%rdi)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: movl %eax, 20(%rdi)
+; CHECK-NEXT: movq 4(%rdi), %rax
+; CHECK-NEXT: movq %rax, 24(%rdi)
+; CHECK-NEXT: movl 12(%rdi), %eax
+; CHECK-NEXT: movl %eax, 32(%rdi)
+; CHECK-NEXT: retq
+;
+; DISABLED-LABEL: test_noalias:
+; DISABLED: # %bb.0: # %entry
+; DISABLED-NEXT: movl %esi, (%rdi)
+; DISABLED-NEXT: movups (%rdi), %xmm0
+; DISABLED-NEXT: movups %xmm0, 20(%rdi)
+; DISABLED-NEXT: retq
+;
+; CHECK-AVX2-LABEL: test_noalias:
+; CHECK-AVX2: # %bb.0: # %entry
+; CHECK-AVX2-NEXT: movl %esi, (%rdi)
+; CHECK-AVX2-NEXT: movl (%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 20(%rdi)
+; CHECK-AVX2-NEXT: movq 4(%rdi), %rax
+; CHECK-AVX2-NEXT: movq %rax, 24(%rdi)
+; CHECK-AVX2-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX2-NEXT: movl %eax, 32(%rdi)
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test_noalias:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl %esi, (%rdi)
+; CHECK-AVX512-NEXT: movl (%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 20(%rdi)
+; CHECK-AVX512-NEXT: movq 4(%rdi), %rax
+; CHECK-AVX512-NEXT: movq %rax, 24(%rdi)
+; CHECK-AVX512-NEXT: movl 12(%rdi), %eax
+; CHECK-AVX512-NEXT: movl %eax, 32(%rdi)
+; CHECK-AVX512-NEXT: retq
+entry:
+ %a = bitcast i8* %A to i32*
+ store i32 %x, i32* %a, align 4
+ %add.ptr = getelementptr inbounds i8, i8* %A, i64 20
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false)
+ ret void
+}
+
+
+