[X86] Implement old kunpck intrinsics using vector ops on vXi1 instead of integer shift/and/or
Summary:
kunpck intrinsics were removed in favor of native IR a few months ago. The implementation lowers them as by operation on the integer types passed to the intrinsic and then just shifting, masking, and oring them together. A special X86 DAG combine was added to recognize this patter and turn it into a concat_vector operation.
I think it makes more sense to keep the IR implementation closer to vector operations on vXi1. Given that we expect these builtins to be used around other builtins that operate on k-registers which we try to represent in IR with vXi1. InstCombine should be able to get rid of the bitcasts between integers and vXi1 leaving only the vector operations.
Reviewers: RKSimon, spatel, zvi, jina.nahias
Reviewed By: RKSimon
Subscribers: cfe-commits
Differential Revision: https://reviews.llvm.org/D42016
llvm-svn: 322461
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9ffc7de..51a3cea 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8456,6 +8456,28 @@
Builder.getInt16Ty());
}
+ case X86::BI__builtin_ia32_kunpckdi:
+ case X86::BI__builtin_ia32_kunpcksi:
+ case X86::BI__builtin_ia32_kunpckhi: {
+ unsigned NumElts = Ops[0]->getType()->getScalarSizeInBits();
+ Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
+ Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
+ uint32_t Indices[64];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i;
+
+ // First extract half of each vector. This gives better codegen than
+ // doing it in a single shuffle.
+ LHS = Builder.CreateShuffleVector(LHS, LHS,
+ makeArrayRef(Indices, NumElts / 2));
+ RHS = Builder.CreateShuffleVector(RHS, RHS,
+ makeArrayRef(Indices, NumElts / 2));
+ // Concat the vectors.
+ Value *Res = Builder.CreateShuffleVector(LHS, RHS,
+ makeArrayRef(Indices, NumElts));
+ return Builder.CreateBitCast(Res, Ops[0]->getType());
+ }
+
case X86::BI__builtin_ia32_vplzcntd_128_mask:
case X86::BI__builtin_ia32_vplzcntd_256_mask:
case X86::BI__builtin_ia32_vplzcntd_512_mask: