[X86] Use __builtin_convertvector to replace some of the avx512 truncate builtins.
As long as the destination type is a 256-bit or 128-bit vector with the same number of elements as the source, we can use __builtin_convertvector to directly generate a trunc IR instruction, which will be handled natively by the backend.
Differential Revision: https://reviews.llvm.org/D46742
llvm-svn: 332266
diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h
index f5ff5d3..499bb8a 100644
--- a/clang/lib/Headers/avx512bwintrin.h
+++ b/clang/lib/Headers/avx512bwintrin.h
@@ -1157,23 +1157,21 @@
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_cvtepi16_epi8 (__m512i __A) {
- return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_setzero_si256(),
- (__mmask32) -1);
+ return (__m256i)__builtin_convertvector((__v32hi)__A, __v32qi);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
- (__v32qi) __O,
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm512_cvtepi16_epi8(__A),
+ (__v32qi)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_setzero_si256(),
- __M);
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm512_cvtepi16_epi8(__A),
+ (__v32qi)_mm256_setzero_si256());
}
static __inline__ void __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index e891b6c..24a4e9f 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -7601,24 +7601,23 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm512_cvtepi32_epi8 (__m512i __A)
{
- return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask16) -1);
+ return (__m128i)__builtin_convertvector((__v16si)__A, __v16qi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
- return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
- (__v16qi) __O, __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm512_cvtepi32_epi8(__A),
+ (__v16qi)__O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
{
- return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm512_cvtepi32_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ void __DEFAULT_FN_ATTRS
@@ -7630,24 +7629,23 @@
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_cvtepi32_epi16 (__m512i __A)
{
- return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_undefined_si256 (),
- (__mmask16) -1);
+ return (__m256i)__builtin_convertvector((__v16si)__A, __v16hi);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
- return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
- (__v16hi) __O, __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm512_cvtepi32_epi16(__A),
+ (__v16hi)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
{
- return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm512_cvtepi32_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
}
static __inline__ void __DEFAULT_FN_ATTRS
@@ -7688,24 +7686,23 @@
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_cvtepi64_epi32 (__m512i __A)
{
- return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
- (__v8si) _mm256_undefined_si256 (),
- (__mmask8) -1);
+ return (__m256i)__builtin_convertvector((__v8di) __A, __v8si);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
- return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
- (__v8si) __O, __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm512_cvtepi64_epi32(__A),
+ (__v8si)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
{
- return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
- (__v8si) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm512_cvtepi64_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ void __DEFAULT_FN_ATTRS
@@ -7717,24 +7714,23 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm512_cvtepi64_epi16 (__m512i __A)
{
- return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
- (__v8hi) _mm_undefined_si128 (),
- (__mmask8) -1);
+ return (__m128i)__builtin_convertvector((__v8di)__A, __v8hi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
- return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
- (__v8hi) __O, __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm512_cvtepi64_epi16(__A),
+ (__v8hi)__O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
{
- return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm512_cvtepi64_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ void __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index e940e2b..6c5131c 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -1551,23 +1551,21 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_cvtepi16_epi8 (__m256i __A) {
- return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
- (__v16qi) _mm_setzero_si128(),
- (__mmask16) -1);
+ return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
- return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
- (__v16qi) __O,
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm256_cvtepi16_epi8(__A),
+ (__v16qi)__O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
- return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
- (__v16qi) _mm_setzero_si128(),
- __M);
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm256_cvtepi16_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
}
static __inline__ void __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 2581c05..0742995 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -7627,24 +7627,23 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_cvtepi32_epi16 (__m256i __A)
{
- return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
- (__v8hi)_mm_setzero_si128 (),
- (__mmask8) -1);
+ return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
{
- return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
- (__v8hi) __O, __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm256_cvtepi32_epi16(__A),
+ (__v8hi)__O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
{
- return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm256_cvtepi32_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
}
static __inline__ void __DEFAULT_FN_ATTRS
@@ -7743,24 +7742,23 @@
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_cvtepi64_epi32 (__m256i __A)
{
- return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
- (__v4si) _mm_undefined_si128(),
- (__mmask8) -1);
+ return (__m128i)__builtin_convertvector((__v4di)__A, __v4si);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
{
- return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
- (__v4si) __O, __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm256_cvtepi64_epi32(__A),
+ (__v4si)__O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
{
- return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm256_cvtepi64_epi32(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ void __DEFAULT_FN_ATTRS