[AVX512] Implement masked and 512-bit pshufd intrinsics directly with __builtin_shufflevector and __builtin_ia32_select.

llvm-svn: 272467
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index b1f0ebf..90b87b2 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9052,19 +9052,34 @@
 }
 
 #define _mm512_shuffle_epi32(A, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_pshufd512_mask((__v16si)(__m512i)(A), (int)(I), \
-                                         (__v16si)_mm512_undefined_epi32(), \
-                                         (__mmask16)-1); })
+  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
+                                   (__v16si)_mm512_setzero_si512(), \
+                                   0  + (((I) & 0x03) >> 0), \
+                                   0  + (((I) & 0x0c) >> 2), \
+                                   0  + (((I) & 0x30) >> 4), \
+                                   0  + (((I) & 0xc0) >> 6), \
+                                   4  + (((I) & 0x03) >> 0), \
+                                   4  + (((I) & 0x0c) >> 2), \
+                                   4  + (((I) & 0x30) >> 4), \
+                                   4  + (((I) & 0xc0) >> 6), \
+                                   8  + (((I) & 0x03) >> 0), \
+                                   8  + (((I) & 0x0c) >> 2), \
+                                   8  + (((I) & 0x30) >> 4), \
+                                   8  + (((I) & 0xc0) >> 6), \
+                                   12 + (((I) & 0x03) >> 0), \
+                                   12 + (((I) & 0x0c) >> 2), \
+                                   12 + (((I) & 0x30) >> 4), \
+                                   12 + (((I) & 0xc0) >> 6)); })
 
 #define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_pshufd512_mask((__v16si)(__m512i)(A), (int)(I), \
-                                         (__v16si)(__m512i)(W), \
-                                         (__mmask16)(U)); })
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)(__m512i)(W)); })
 
 #define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_pshufd512_mask((__v16si)(__m512i)(A), (int)(I), \
-                                         (__v16si)_mm512_setzero_si512(), \
-                                         (__mmask16)(U)); })
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                      (__v16si)_mm512_shuffle_epi32((A), (I)), \
+                                      (__v16si)_mm512_setzero_si512()); })
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 62e5e86..830be8d 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -9257,25 +9257,24 @@
 }
 
 #define _mm256_mask_shuffle_epi32(W, U, A, I) __extension__({\
-  (__m256i)__builtin_ia32_pshufd256_mask((__v8si)(__m256i)(A), (int)(I), \
-                                         (__v8si)(__m256i)(W), \
-                                         (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_epi32((A), (I)), \
+                                      (__v8si)(__m256i)(W)); })
 
 #define _mm256_maskz_shuffle_epi32(U, A, I) __extension__({\
-  (__m256i)__builtin_ia32_pshufd256_mask((__v8si)(__m256i)(A), (int)(I), \
-                                         (__v8si)_mm256_setzero_si256(), \
-                                         (__mmask8)(U)); })
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                      (__v8si)_mm256_shuffle_epi32((A), (I)), \
+                                      (__v8si)_mm256_setzero_si256()); })
 
 #define _mm_mask_shuffle_epi32(W, U, A, I) __extension__({\
-  (__m128i)__builtin_ia32_pshufd128_mask((__v4si)(__m128i)(A), (int)(I), \
-                                         (__v4si)(__m128i)(W), \
-                                         (__mmask8)(U)); })
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shuffle_epi32((A), (I)), \
+                                      (__v4si)(__m128i)(W)); })
 
 #define _mm_maskz_shuffle_epi32(U, A, I) __extension__({\
-  (__m128i)__builtin_ia32_pshufd128_mask((__v4si)(__m128i)(A), (int)(I), \
-                                         (__v4si)_mm_setzero_si128(), \
-                                         (__mmask8)(U)); })
-
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shuffle_epi32((A), (I)), \
+                                      (__v4si)_mm_setzero_si128()); })
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)