[X86] Fold masking into subvector extract builtins.
I'm looking into making the select builtins require avx512f, avx512bw, or avx512vl since masking operations generally require those features.
The extract builtins are a special case because the 512-bit versions return a 128-bit or 256-bit vector with masking even when avx512vl is not supported.
llvm-svn: 334330
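
For reference, a minimal sketch (not part of this patch) of the semantics the folded builtin lowers to for the masked 32x4 case: a subvector extract followed by a per-element select against the passthrough. The helper name and the plain-C formulation are illustrative only.

    /* Reference semantics only: not the actual CodeGen. */
    static void extractf32x4_mask_ref(const float a[16], int imm,
                                      const float w[4], unsigned char u,
                                      float out[4]) {
      int base = (imm & 3) * 4;                /* imm picks one of four 128-bit lanes */
      for (int i = 0; i != 4; ++i)
        out[i] = ((u >> i) & 1) ? a[base + i]  /* mask bit set: extracted element */
                                : w[i];        /* mask bit clear: passthrough W */
    }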
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ac9f46c..012428d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9239,18 +9239,18 @@
case X86::BI__builtin_ia32_vextractf128_ps256:
case X86::BI__builtin_ia32_vextractf128_si256:
case X86::BI__builtin_ia32_extract128i256:
- case X86::BI__builtin_ia32_extractf64x4:
- case X86::BI__builtin_ia32_extractf32x4:
- case X86::BI__builtin_ia32_extracti64x4:
- case X86::BI__builtin_ia32_extracti32x4:
- case X86::BI__builtin_ia32_extractf32x8:
- case X86::BI__builtin_ia32_extracti32x8:
- case X86::BI__builtin_ia32_extractf32x4_256:
- case X86::BI__builtin_ia32_extracti32x4_256:
- case X86::BI__builtin_ia32_extractf64x2_256:
- case X86::BI__builtin_ia32_extracti64x2_256:
- case X86::BI__builtin_ia32_extractf64x2_512:
- case X86::BI__builtin_ia32_extracti64x2_512: {
+ case X86::BI__builtin_ia32_extractf64x4_mask:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask: {
llvm::Type *DstTy = ConvertType(E->getType());
unsigned NumElts = DstTy->getVectorNumElements();
unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue() * NumElts;
@@ -9259,10 +9259,15 @@
for (unsigned i = 0; i != NumElts; ++i)
Indices[i] = i + Index;
- return Builder.CreateShuffleVector(Ops[0],
- UndefValue::get(Ops[0]->getType()),
- makeArrayRef(Indices, NumElts),
- "extract");
+ Value *Res = Builder.CreateShuffleVector(Ops[0],
+ UndefValue::get(Ops[0]->getType()),
+ makeArrayRef(Indices, NumElts),
+ "extract");
+
+ if (Ops.size() == 4)
+ Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
+
+ return Res;
}
case X86::BI__builtin_ia32_vinsertf128_pd256:
case X86::BI__builtin_ia32_vinsertf128_ps256:
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index fbb4bbc..76ce32f 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1103,56 +1103,70 @@
}
#define _mm512_extractf32x8_ps(A, imm) \
- (__m256)__builtin_ia32_extractf32x8((__v16sf)(__m512)(A), (int)(imm))
+ (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v8sf)_mm256_undefined_ps(), \
+                                            (__mmask8)-1)
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
- (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
- (__v8sf)(__m256)(W))
+ (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
- (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
- (__v8sf)_mm256_setzero_ps())
+ (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U))
#define _mm512_extractf64x2_pd(A, imm) \
- (__m128d)__builtin_ia32_extractf64x2_512((__v8df)(__m512d)(A), (int)(imm))
+ (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_undefined_pd(), \
+ (__mmask8)-1)
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
- (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
- (__v2df)(__m128d)(W))
+ (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+ (int)(imm), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
- (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm512_extractf64x2_pd((A), (imm)), \
- (__v2df)_mm_setzero_pd())
+ (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U))
#define _mm512_extracti32x8_epi32(A, imm) \
- (__m256i)__builtin_ia32_extracti32x8((__v16si)(__m512i)(A), (int)(imm))
+ (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v8si)_mm256_undefined_si256(), \
+ (__mmask8)-1)
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
- (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
- (__v8si)(__m256i)(W))
+ (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
- (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
- (__v8si)_mm256_setzero_si256())
+ (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)(U))
#define _mm512_extracti64x2_epi64(A, imm) \
- (__m128i)__builtin_ia32_extracti64x2_512((__v8di)(__m512i)(A), (int)(imm))
+ (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_undefined_si128(), \
+ (__mmask8)-1)
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
- (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
- (__v2di)(__m128i)(W))
+ (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+ (int)(imm), \
+ (__v2di)(__m128i)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
- (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
- (__v2di)_mm_setzero_si128())
+ (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_setzero_si128(), \
+ (__mmask8)(U))
#define _mm512_insertf32x8(A, B, imm) \
(__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 381011e..dbac414 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -3494,30 +3494,34 @@
/* Vector Extract */
#define _mm512_extractf64x4_pd(A, I) \
- (__m256d)__builtin_ia32_extractf64x4((__v8df)(__m512d)(A), (int)(I))
+ (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
+                                          (__v4df)_mm256_undefined_pd(), \
+ (__mmask8)-1)
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
- (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
- (__v4df)(__m256d)(W))
+ (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
- (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
- (__v4df)_mm256_setzero_pd())
+ (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)(U))
#define _mm512_extractf32x4_ps(A, I) \
- (__m128)__builtin_ia32_extractf32x4((__v16sf)(__m512)(A), (int)(I))
+ (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
+ (__v4sf)_mm_undefined_ps(), \
+ (__mmask8)-1)
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
- (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
- (__v4sf)(__m128)(W))
+ (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
- (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
- (__v4sf)_mm_setzero_ps())
+ (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U))
/* Vector Blend */
@@ -7534,30 +7538,34 @@
}
#define _mm512_extracti32x4_epi32(A, imm) \
- (__m128i)__builtin_ia32_extracti32x4((__v16si)(__m512i)(A), (int)(imm))
+ (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v4si)_mm_undefined_si128(), \
+ (__mmask8)-1)
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
- (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
- (__v4si)(__m128i)(W))
+ (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
- (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
- (__v4si)_mm_setzero_si128())
+ (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v4si)_mm_setzero_si128(), \
+ (__mmask8)(U))
#define _mm512_extracti64x4_epi64(A, imm) \
- (__m256i)__builtin_ia32_extracti64x4((__v8di)(__m512i)(A), (int)(imm))
+ (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+ (__v4di)_mm256_undefined_si256(), \
+ (__mmask8)-1)
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
- (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
- (__v4di)(__m256i)(W))
+ (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(U))
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
- (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
- (__v4di)_mm256_setzero_si256())
+ (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+ (__v4di)_mm256_setzero_si256(), \
+ (__mmask8)(U))
#define _mm512_insertf64x4(A, B, imm) \
(__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index 8d6ff3e..62c0fd7 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -1083,30 +1083,40 @@
}
#define _mm256_extractf64x2_pd(A, imm) \
- (__m128d)__builtin_ia32_extractf64x2_256((__v4df)(__m256d)(A), (int)(imm))
+ (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_undefined_pd(), \
+ (__mmask8)-1)
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
- (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
- (__v2df)(__m128d)(W))
+ (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U))
#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
- (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
- (__v2df)_mm_setzero_pd())
+ (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U))
#define _mm256_extracti64x2_epi64(A, imm) \
- (__m128i)__builtin_ia32_extracti64x2_256((__v4di)(__m256i)(A), (int)(imm))
+ (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_undefined_si128(), \
+ (__mmask8)-1)
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
- (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
- (__v2di)(__m128i)(W))
+ (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+ (int)(imm), \
+ (__v2di)(__m128i)(W), \
+ (__mmask8)(U))
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
- (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
- (__v2di)_mm_setzero_si128())
+ (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_setzero_si128(), \
+ (__mmask8)(U))
#define _mm256_insertf64x2(A, B, imm) \
(__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 22a1fcb..fea88e2 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -7699,30 +7699,40 @@
}
#define _mm256_extractf32x4_ps(A, imm) \
- (__m128)__builtin_ia32_extractf32x4_256((__v8sf)(__m256)(A), (int)(imm))
+ (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
+ (int)(imm), \
+ (__v4sf)_mm_undefined_ps(), \
+ (__mmask8)-1)
#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
- (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
- (__v4sf)(__m128)(W))
+ (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
+ (int)(imm), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U))
#define _mm256_maskz_extractf32x4_ps(U, A, imm) \
- (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
- (__v4sf)_mm_setzero_ps())
+ (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
+ (int)(imm), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U))
#define _mm256_extracti32x4_epi32(A, imm) \
- (__m128i)__builtin_ia32_extracti32x4_256((__v8si)(__m256i)(A), (int)(imm))
+ (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
+ (int)(imm), \
+ (__v4si)_mm_undefined_si128(), \
+ (__mmask8)-1)
#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
- (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
- (__v4si)(__m128i)(W))
+ (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
+ (int)(imm), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U))
#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \
- (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
- (__v4si)_mm_setzero_si128())
+ (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
+ (int)(imm), \
+ (__v4si)_mm_setzero_si128(), \
+ (__mmask8)(U))
#define _mm256_insertf32x4(A, B, imm) \
(__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 3e0d36b..ae9c2f0 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2611,14 +2611,14 @@
case X86::BI__builtin_ia32_vextractf128_ps256:
case X86::BI__builtin_ia32_vextractf128_si256:
case X86::BI__builtin_ia32_extract128i256:
- case X86::BI__builtin_ia32_extractf64x4:
- case X86::BI__builtin_ia32_extracti64x4:
- case X86::BI__builtin_ia32_extractf32x8:
- case X86::BI__builtin_ia32_extracti32x8:
- case X86::BI__builtin_ia32_extractf64x2_256:
- case X86::BI__builtin_ia32_extracti64x2_256:
- case X86::BI__builtin_ia32_extractf32x4_256:
- case X86::BI__builtin_ia32_extracti32x4_256:
+ case X86::BI__builtin_ia32_extractf64x4_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
i = 1; l = 0; u = 1;
break;
case X86::BI__builtin_ia32_vec_set_v2di:
@@ -2641,10 +2641,10 @@
case X86::BI__builtin_ia32_vec_ext_v4si:
case X86::BI__builtin_ia32_vec_ext_v4sf:
case X86::BI__builtin_ia32_vec_ext_v4di:
- case X86::BI__builtin_ia32_extractf32x4:
- case X86::BI__builtin_ia32_extracti32x4:
- case X86::BI__builtin_ia32_extractf64x2_512:
- case X86::BI__builtin_ia32_extracti64x2_512:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask:
i = 1; l = 0; u = 3;
break;
case X86::BI_mm_prefetch: