[X86] Stop implicitly enabling avx512vl when avx512bf16 is enabled.

Previously we enabled avx512vl implicitly so that the 256-bit selectw builtin could be used in the implementation of the 512->256-bit conversion intrinsics.

After this commit we use a masked convert builtin whose custom lowering in CGBuiltin emits both the intrinsic call and the 256-bit select, so the header only needs to call that one builtin.
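
For illustration only (not part of the patch, and the function names and flag set are an assumption): with this change a translation unit like the following should build with just -mavx512bf16, without avx512vl being enabled behind the scenes:

  #include <immintrin.h>

  // Uses only the 512->256-bit conversion intrinsics from avx512bf16intrin.h;
  // the masked forms now go through the single masked builtin plus a select
  // emitted in CGBuiltin, so the 256-bit selectw builtin (avx512vl) is not needed.
  __m256bh cvt(__m512 a) { return _mm512_cvtneps_pbh(a); }
  __m256bh cvt_mask(__m256bh w, __mmask16 u, __m512 a) {
    return _mm512_mask_cvtneps_pbh(w, u, a);
  }
  __m256bh cvt_maskz(__mmask16 u, __m512 a) {
    return _mm512_maskz_cvtneps_pbh(u, a);
  }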

llvm-svn: 360924
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index b83c938..44f5fbf 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -661,14 +661,12 @@
     if ((Name.startswith("avx512vbmi") || Name == "avx512bitalg") && Enabled)
       Features["avx512bw"] = true;
     if (Name == "avx512bf16" && Enabled)
-      Features["avx512bw"] = Features["avx512vl"] = true;
+      Features["avx512bw"] = true;
     // Also disable VBMI/VBMI2/BITALG if BWI is being disabled.
     if (Name == "avx512bw" && !Enabled)
       Features["avx512vbmi"] = Features["avx512vbmi2"] =
       Features["avx512bf16"] =
       Features["avx512bitalg"] = false;
-    if (Name == "avx512vl" && !Enabled)
-      Features["avx512bf16"] = false;
   } else if (Name == "fma") {
     if (Enabled)
       setSSELevel(Features, AVX, Enabled);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 8e70720..4396357 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -11885,6 +11885,22 @@
     return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
   }
 
+  case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
+  case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    default: llvm_unreachable("Unsupported intrinsic!");
+    case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
+      IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
+      break;
+    case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
+      IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
+      break;
+    }
+    Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
+    return EmitX86Select(*this, Ops[2], Res, Ops[1]);
+  }
+
   case X86::BI__emul:
   case X86::BI__emulu: {
     llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 1a3d046..b3b5250 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -95,7 +95,9 @@
 /// \returns A 256-bit vector of [16 x bfloat] comes from conversion of __A.
 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 _mm512_cvtneps_pbh(__m512 __A) {
-  return (__m256bh)__builtin_ia32_cvtneps2bf16_512((__v16sf) __A);
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+                                              (__v16hi)_mm256_undefined_si256(),
+                                              (__mmask16)-1);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -114,9 +116,9 @@
 /// \returns A 256-bit vector of [16 x bfloat] comes from conversion of __A.
 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
-  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
-                                              (__v16hi)_mm512_cvtneps_pbh(__A),
-                                              (__v16hi)__W);
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+                                                        (__v16hi)__W,
+                                                        (__mmask16)__U);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -133,9 +135,9 @@
 /// \returns A 256-bit vector of [16 x bfloat] comes from conversion of __A.
 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
-  return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
-                                              (__v16hi)_mm512_cvtneps_pbh(__A),
-                                              (__v16hi)_mm256_setzero_si256());
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+                                                (__v16hi)_mm256_setzero_si256(),
+                                                (__mmask16)__U);
 }
 
 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index f5da42b..5a56880 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -220,7 +220,9 @@
 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
 _mm256_cvtneps_pbh(__m256 __A) {
-  return (__m128bh)__builtin_ia32_cvtneps2bf16_256((__v8sf)__A);
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+                                                  (__v8hi)_mm_undefined_si128(),
+                                                  (__mmask8)-1);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -239,9 +241,9 @@
 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
-  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
-                                              (__v8hi)_mm256_cvtneps_pbh(__A),
-                                              (__v8hi)__W);
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+                                                        (__v8hi)__W,
+                                                        (__mmask8)__U);
 }
 
 /// Convert Packed Single Data to Packed BF16 Data.
@@ -258,9 +260,9 @@
 /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
 static __inline__ __m128bh __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
-  return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
-                                              (__v8hi)_mm256_cvtneps_pbh(__A),
-                                              (__v8hi)_mm_setzero_si128());
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+                                                    (__v8hi)_mm_setzero_si128(),
+                                                    (__mmask8)__U);
 }
 
 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.