[X86] Add subvector insert and extract builtins to enable target feature checking and immediate range checking.

Test changes are due to differences in how we generate undef elements now. We also changed the types used for extractf128_si256/insertf128_si256 to match the signature of the builtin that previously existed which this patch resurrects. This also matches gcc.

llvm-svn: 334261
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 7cc5e60..c63289e 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -491,6 +491,9 @@
 TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_cmpsd, "V2dV2dV2dIc", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_cmpss, "V4fV4fV4fIc", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIc", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIc", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIc", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvtps2dq256, "V8iV8f", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256, "V4iV4d", "nc", "avx")
@@ -503,6 +506,9 @@
 TARGET_BUILTIN(__builtin_ia32_vpermilps, "V4fV4fIi", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_vpermilpd256, "V4dV4dIi", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_vpermilps256, "V8fV8fIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vinsertf128_pd256, "V4dV4dV2dIc", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vinsertf128_ps256, "V8fV8fV4fIc", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_vinsertf128_si256, "V8iV8iV4iIc", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_sqrtpd256, "V4dV4d", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_sqrtps256, "V8fV8f", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_rsqrtps256, "V8fV8f", "nc", "avx")
@@ -618,6 +624,8 @@
 TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_extract128i256, "V2LLiV4LLiIc", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_insert128i256, "V4LLiV4LLiV2LLiIc", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_maskloadd256, "V8iV8iC*V8i", "n", "avx2")
 TARGET_BUILTIN(__builtin_ia32_maskloadq256, "V4LLiV4LLiC*V4LLi", "n", "avx2")
 TARGET_BUILTIN(__builtin_ia32_maskloadd, "V4iV4iC*V4i", "n", "avx2")
@@ -927,6 +935,8 @@
 TARGET_BUILTIN(__builtin_ia32_alignd256, "V8iV8iV8iIi", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_alignq128, "V2LLiV2LLiV2LLiIi", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_alignq256, "V4LLiV4LLiV4LLiIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extractf64x4, "V4dV8dIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_extractf32x4, "V4fV16fIi", "nc", "avx512f")
 
 TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "nc", "avx512vl,avx512vnni")
 TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "nc", "avx512vl,avx512vnni")
@@ -1646,6 +1656,28 @@
 TARGET_BUILTIN(__builtin_ia32_pmovqw128mem_mask, "vV8s*V2LLiUc", "n", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pmovqw256_mask, "V8sV4LLiV8sUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pmovqw256mem_mask, "vV8s*V4LLiUc", "n", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extractf32x8, "V8fV16fIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extractf64x2_512, "V2dV8dIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extracti32x8, "V8iV16iIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extracti64x2_512, "V2LLiV8LLiIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_extracti32x4, "V4iV16iIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_extracti64x4, "V4LLiV8LLiIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_extractf64x2_256, "V2dV4dIi", "nc", "avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extracti64x2_256, "V2LLiV4LLiIi", "nc", "avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extractf32x4_256, "V4fV8fIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_extracti32x4_256, "V4iV8iIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_insertf32x8, "V16fV16fV8fIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_insertf64x2_512, "V8dV8dV2dIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_inserti32x8, "V16iV16iV8iIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_inserti64x2_512, "V8LLiV8LLiV2LLiIi", "nc", "avx512dq")
+TARGET_BUILTIN(__builtin_ia32_insertf64x4, "V8dV8dV4dIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_inserti64x4, "V8LLiV8LLiV4LLiIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_insertf64x2_256, "V4dV4dV2dIi", "nc", "avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_inserti64x2_256, "V4LLiV4LLiV2LLiIi", "nc", "avx512dq,avx512vl")
+TARGET_BUILTIN(__builtin_ia32_insertf32x4_256, "V8fV8fV4fIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_inserti32x4_256, "V8iV8iV4iIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_insertf32x4, "V16fV16fV4fIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_inserti32x4, "V16iV16iV4iIi", "nc", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_getmantpd128_mask, "V2dV2diV2dUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_getmantpd256_mask, "V4dV4diV4dUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_getmantps128_mask, "V4fV4fiV4fUc", "nc", "avx512vl")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4331005..025b34e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9235,6 +9235,75 @@
     Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
   }
+  case X86::BI__builtin_ia32_vextractf128_pd256:
+  case X86::BI__builtin_ia32_vextractf128_ps256:
+  case X86::BI__builtin_ia32_vextractf128_si256:
+  case X86::BI__builtin_ia32_extract128i256:
+  case X86::BI__builtin_ia32_extractf64x4:
+  case X86::BI__builtin_ia32_extractf32x4:
+  case X86::BI__builtin_ia32_extracti64x4:
+  case X86::BI__builtin_ia32_extracti32x4:
+  case X86::BI__builtin_ia32_extractf32x8:
+  case X86::BI__builtin_ia32_extracti32x8:
+  case X86::BI__builtin_ia32_extractf32x4_256:
+  case X86::BI__builtin_ia32_extracti32x4_256:
+  case X86::BI__builtin_ia32_extractf64x2_256:
+  case X86::BI__builtin_ia32_extracti64x2_256:
+  case X86::BI__builtin_ia32_extractf64x2_512:
+  case X86::BI__builtin_ia32_extracti64x2_512: {
+    llvm::Type *DstTy = ConvertType(E->getType());
+    unsigned NumElts = DstTy->getVectorNumElements();
+    unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue() * NumElts;
+
+    uint32_t Indices[16];
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = i + Index;
+
+    return Builder.CreateShuffleVector(Ops[0],
+                                       UndefValue::get(Ops[0]->getType()),
+                                       makeArrayRef(Indices, NumElts),
+                                       "extract");
+  }
+  case X86::BI__builtin_ia32_vinsertf128_pd256:
+  case X86::BI__builtin_ia32_vinsertf128_ps256:
+  case X86::BI__builtin_ia32_vinsertf128_si256:
+  case X86::BI__builtin_ia32_insert128i256:
+  case X86::BI__builtin_ia32_insertf64x4:
+  case X86::BI__builtin_ia32_insertf32x4:
+  case X86::BI__builtin_ia32_inserti64x4:
+  case X86::BI__builtin_ia32_inserti32x4:
+  case X86::BI__builtin_ia32_insertf32x8:
+  case X86::BI__builtin_ia32_inserti32x8:
+  case X86::BI__builtin_ia32_insertf32x4_256:
+  case X86::BI__builtin_ia32_inserti32x4_256:
+  case X86::BI__builtin_ia32_insertf64x2_256:
+  case X86::BI__builtin_ia32_inserti64x2_256:
+  case X86::BI__builtin_ia32_insertf64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512: {
+    unsigned DstNumElts = Ops[0]->getType()->getVectorNumElements();
+    unsigned SrcNumElts = Ops[1]->getType()->getVectorNumElements();
+    unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue() * SrcNumElts;
+
+    uint32_t Indices[16];
+    for (unsigned i = 0; i != DstNumElts; ++i)
+      Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
+
+    Value *Op1 = Builder.CreateShuffleVector(Ops[1],
+                                             UndefValue::get(Ops[1]->getType()),
+                                             makeArrayRef(Indices, DstNumElts),
+                                             "widen");
+
+    for (unsigned i = 0; i != DstNumElts; ++i) {
+      if (i >= Index && i < (Index + SrcNumElts))
+        Indices[i] = (i - Index) + DstNumElts;
+      else
+        Indices[i] = i;
+    }
+
+    return Builder.CreateShuffleVector(Ops[0], Op1,
+                                       makeArrayRef(Indices, DstNumElts),
+                                       "insert");
+  }
   case X86::BI__builtin_ia32_pblendw128:
   case X86::BI__builtin_ia32_blendpd:
   case X86::BI__builtin_ia32_blendps:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index d1c5306..3867af0 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -881,18 +881,11 @@
   (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M))
 
 #define _mm256_extracti128_si256(V, M) \
-  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
-                                   (__v4di)_mm256_undefined_si256(), \
-                                   (((M) & 1) ? 2 : 0), \
-                                   (((M) & 1) ? 3 : 1) )
+  (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
 
 #define _mm256_inserti128_si256(V1, V2, M) \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
-                                   (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
-                                   (((M) & 1) ? 0 : 4), \
-                                   (((M) & 1) ? 1 : 5), \
-                                   (((M) & 1) ? 4 : 2), \
-                                   (((M) & 1) ? 5 : 3) )
+  (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+                                        (__v2di)(__m128i)(V2), (int)(M))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskload_epi32(int const *__X, __m256i __M)
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index 9ef7804..fbb4bbc 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1103,16 +1103,7 @@
 }
 
 #define _mm512_extractf32x8_ps(A, imm) \
-  (__m256)__builtin_shufflevector((__v16sf)(__m512)(A),           \
-                                  (__v16sf)_mm512_undefined_ps(), \
-                                  ((imm) & 1) ?  8 : 0,           \
-                                  ((imm) & 1) ?  9 : 1,           \
-                                  ((imm) & 1) ? 10 : 2,           \
-                                  ((imm) & 1) ? 11 : 3,           \
-                                  ((imm) & 1) ? 12 : 4,           \
-                                  ((imm) & 1) ? 13 : 5,           \
-                                  ((imm) & 1) ? 14 : 6,           \
-                                  ((imm) & 1) ? 15 : 7)
+  (__m256)__builtin_ia32_extractf32x8((__v16sf)(__m512)(A), (int)(imm))
 
 #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
@@ -1125,10 +1116,7 @@
                                    (__v8sf)_mm256_setzero_ps())
 
 #define _mm512_extractf64x2_pd(A, imm) \
-  (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A),          \
-                                   (__v8df)_mm512_undefined_pd(), \
-                                   0 + ((imm) & 0x3) * 2,         \
-                                   1 + ((imm) & 0x3) * 2)
+  (__m128d)__builtin_ia32_extractf64x2_512((__v8df)(__m512d)(A), (int)(imm))
 
 #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
@@ -1141,16 +1129,7 @@
                                    (__v2df)_mm_setzero_pd())
 
 #define _mm512_extracti32x8_epi32(A, imm) \
-  (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A),             \
-                                   (__v16si)_mm512_undefined_epi32(), \
-                                   ((imm) & 1) ?  8 : 0,              \
-                                   ((imm) & 1) ?  9 : 1,              \
-                                   ((imm) & 1) ? 10 : 2,              \
-                                   ((imm) & 1) ? 11 : 3,              \
-                                   ((imm) & 1) ? 12 : 4,              \
-                                   ((imm) & 1) ? 13 : 5,              \
-                                   ((imm) & 1) ? 14 : 6,              \
-                                   ((imm) & 1) ? 15 : 7)
+  (__m256i)__builtin_ia32_extracti32x8((__v16si)(__m512i)(A), (int)(imm))
 
 #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
   (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
@@ -1163,10 +1142,7 @@
                                 (__v8si)_mm256_setzero_si256())
 
 #define _mm512_extracti64x2_epi64(A, imm) \
-  (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A),          \
-                                   (__v8di)_mm512_undefined_epi32(), \
-                                   0 + ((imm) & 0x3) * 2,           \
-                                   1 + ((imm) & 0x3) * 2)
+  (__m128i)__builtin_ia32_extracti64x2_512((__v8di)(__m512i)(A), (int)(imm))
 
 #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
   (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
@@ -1179,24 +1155,8 @@
                                 (__v2di)_mm_setzero_si128())
 
 #define _mm512_insertf32x8(A, B, imm) \
-  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
-                                  (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
-                                  ((imm) & 0x1) ?  0 : 16, \
-                                  ((imm) & 0x1) ?  1 : 17, \
-                                  ((imm) & 0x1) ?  2 : 18, \
-                                  ((imm) & 0x1) ?  3 : 19, \
-                                  ((imm) & 0x1) ?  4 : 20, \
-                                  ((imm) & 0x1) ?  5 : 21, \
-                                  ((imm) & 0x1) ?  6 : 22, \
-                                  ((imm) & 0x1) ?  7 : 23, \
-                                  ((imm) & 0x1) ? 16 :  8, \
-                                  ((imm) & 0x1) ? 17 :  9, \
-                                  ((imm) & 0x1) ? 18 : 10, \
-                                  ((imm) & 0x1) ? 19 : 11, \
-                                  ((imm) & 0x1) ? 20 : 12, \
-                                  ((imm) & 0x1) ? 21 : 13, \
-                                  ((imm) & 0x1) ? 22 : 14, \
-                                  ((imm) & 0x1) ? 23 : 15)
+  (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
+                                     (__v8sf)(__m256)(B), (int)(imm))
 
 #define _mm512_mask_insertf32x8(W, U, A, B, imm) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
@@ -1209,16 +1169,8 @@
                                  (__v16sf)_mm512_setzero_ps())
 
 #define _mm512_insertf64x2(A, B, imm) \
-  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
-                                  (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
-                                  (((imm) & 0x3) == 0) ? 8 : 0, \
-                                  (((imm) & 0x3) == 0) ? 9 : 1, \
-                                  (((imm) & 0x3) == 1) ? 8 : 2, \
-                                  (((imm) & 0x3) == 1) ? 9 : 3, \
-                                  (((imm) & 0x3) == 2) ? 8 : 4, \
-                                  (((imm) & 0x3) == 2) ? 9 : 5, \
-                                  (((imm) & 0x3) == 3) ? 8 : 6, \
-                                  (((imm) & 0x3) == 3) ? 9 : 7)
+  (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(imm))
 
 #define _mm512_mask_insertf64x2(W, U, A, B, imm) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@@ -1231,24 +1183,8 @@
                                   (__v8df)_mm512_setzero_pd())
 
 #define _mm512_inserti32x8(A, B, imm) \
-  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
-                                 (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
-                                 ((imm) & 0x1) ?  0 : 16, \
-                                 ((imm) & 0x1) ?  1 : 17, \
-                                 ((imm) & 0x1) ?  2 : 18, \
-                                 ((imm) & 0x1) ?  3 : 19, \
-                                 ((imm) & 0x1) ?  4 : 20, \
-                                 ((imm) & 0x1) ?  5 : 21, \
-                                 ((imm) & 0x1) ?  6 : 22, \
-                                 ((imm) & 0x1) ?  7 : 23, \
-                                 ((imm) & 0x1) ? 16 :  8, \
-                                 ((imm) & 0x1) ? 17 :  9, \
-                                 ((imm) & 0x1) ? 18 : 10, \
-                                 ((imm) & 0x1) ? 19 : 11, \
-                                 ((imm) & 0x1) ? 20 : 12, \
-                                 ((imm) & 0x1) ? 21 : 13, \
-                                 ((imm) & 0x1) ? 22 : 14, \
-                                 ((imm) & 0x1) ? 23 : 15)
+  (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
+                                      (__v8si)(__m256i)(B), (int)(imm))
 
 #define _mm512_mask_inserti32x8(W, U, A, B, imm) \
   (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
@@ -1261,16 +1197,8 @@
                                  (__v16si)_mm512_setzero_si512())
 
 #define _mm512_inserti64x2(A, B, imm) \
-  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
-                                  (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
-                                  (((imm) & 0x3) == 0) ? 8 : 0, \
-                                  (((imm) & 0x3) == 0) ? 9 : 1, \
-                                  (((imm) & 0x3) == 1) ? 8 : 2, \
-                                  (((imm) & 0x3) == 1) ? 9 : 3, \
-                                  (((imm) & 0x3) == 2) ? 8 : 4, \
-                                  (((imm) & 0x3) == 2) ? 9 : 5, \
-                                  (((imm) & 0x3) == 3) ? 8 : 6, \
-                                  (((imm) & 0x3) == 3) ? 9 : 7)
+  (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
+                                          (__v2di)(__m128i)(B), (int)(imm))
 
 #define _mm512_mask_inserti64x2(W, U, A, B, imm) \
   (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index ccc445a..4ae235e 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -3494,12 +3494,7 @@
 /* Vector Extract */
 
 #define _mm512_extractf64x4_pd(A, I) \
-  (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A),          \
-                                   (__v8df)_mm512_undefined_pd(), \
-                                   ((I) & 1) ? 4 : 0,             \
-                                   ((I) & 1) ? 5 : 1,             \
-                                   ((I) & 1) ? 6 : 2,             \
-                                   ((I) & 1) ? 7 : 3)
+  (__m256d)__builtin_ia32_extractf64x4((__v8df)(__m512d)(A), (int)(I))
 
 #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
@@ -3512,12 +3507,7 @@
                                    (__v4df)_mm256_setzero_pd())
 
 #define _mm512_extractf32x4_ps(A, I) \
-  (__m128)__builtin_shufflevector((__v16sf)(__m512)(A),           \
-                                  (__v16sf)_mm512_undefined_ps(), \
-                                  0 + ((I) & 0x3) * 4,            \
-                                  1 + ((I) & 0x3) * 4,            \
-                                  2 + ((I) & 0x3) * 4,            \
-                                  3 + ((I) & 0x3) * 4)
+  (__m128)__builtin_ia32_extractf32x4((__v16sf)(__m512)(A), (int)(I))
 
 #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
   (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
@@ -7544,12 +7534,7 @@
 }
 
 #define _mm512_extracti32x4_epi32(A, imm) \
-  (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A),             \
-                                   (__v16si)_mm512_undefined_epi32(), \
-                                   0 + ((imm) & 0x3) * 4,             \
-                                   1 + ((imm) & 0x3) * 4,             \
-                                   2 + ((imm) & 0x3) * 4,             \
-                                   3 + ((imm) & 0x3) * 4)
+  (__m128i)__builtin_ia32_extracti32x4((__v16si)(__m512i)(A), (int)(imm))
 
 #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
   (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
@@ -7562,12 +7547,7 @@
                                 (__v4si)_mm_setzero_si128())
 
 #define _mm512_extracti64x4_epi64(A, imm) \
-  (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A),             \
-                                   (__v8di)_mm512_undefined_epi32(), \
-                                   ((imm) & 1) ? 4 : 0,              \
-                                   ((imm) & 1) ? 5 : 1,              \
-                                   ((imm) & 1) ? 6 : 2,              \
-                                   ((imm) & 1) ? 7 : 3)
+  (__m256i)__builtin_ia32_extracti64x4((__v8di)(__m512i)(A), (int)(imm))
 
 #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
   (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
@@ -7580,16 +7560,8 @@
                                 (__v4di)_mm256_setzero_si256())
 
 #define _mm512_insertf64x4(A, B, imm) \
-  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
-                                 (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
-                                 ((imm) & 0x1) ?  0 :  8, \
-                                 ((imm) & 0x1) ?  1 :  9, \
-                                 ((imm) & 0x1) ?  2 : 10, \
-                                 ((imm) & 0x1) ?  3 : 11, \
-                                 ((imm) & 0x1) ?  8 :  4, \
-                                 ((imm) & 0x1) ?  9 :  5, \
-                                 ((imm) & 0x1) ? 10 :  6, \
-                                 ((imm) & 0x1) ? 11 :  7)
+  (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
+                                      (__v4df)(__m256d)(B), (int)(imm))
 
 #define _mm512_mask_insertf64x4(W, U, A, B, imm) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@@ -7602,16 +7574,8 @@
                                   (__v8df)_mm512_setzero_pd())
 
 #define _mm512_inserti64x4(A, B, imm) \
-  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
-                                 (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
-                                 ((imm) & 0x1) ?  0 :  8, \
-                                 ((imm) & 0x1) ?  1 :  9, \
-                                 ((imm) & 0x1) ?  2 : 10, \
-                                 ((imm) & 0x1) ?  3 : 11, \
-                                 ((imm) & 0x1) ?  8 :  4, \
-                                 ((imm) & 0x1) ?  9 :  5, \
-                                 ((imm) & 0x1) ? 10 :  6, \
-                                 ((imm) & 0x1) ? 11 :  7)
+  (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
+                                      (__v4di)(__m256i)(B), (int)(imm))
 
 #define _mm512_mask_inserti64x4(W, U, A, B, imm) \
   (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
@@ -7624,24 +7588,8 @@
                                   (__v8di)_mm512_setzero_si512())
 
 #define _mm512_insertf32x4(A, B, imm) \
-  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
-                                  (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
-                                  (((imm) & 0x3) == 0) ? 16 :  0, \
-                                  (((imm) & 0x3) == 0) ? 17 :  1, \
-                                  (((imm) & 0x3) == 0) ? 18 :  2, \
-                                  (((imm) & 0x3) == 0) ? 19 :  3, \
-                                  (((imm) & 0x3) == 1) ? 16 :  4, \
-                                  (((imm) & 0x3) == 1) ? 17 :  5, \
-                                  (((imm) & 0x3) == 1) ? 18 :  6, \
-                                  (((imm) & 0x3) == 1) ? 19 :  7, \
-                                  (((imm) & 0x3) == 2) ? 16 :  8, \
-                                  (((imm) & 0x3) == 2) ? 17 :  9, \
-                                  (((imm) & 0x3) == 2) ? 18 : 10, \
-                                  (((imm) & 0x3) == 2) ? 19 : 11, \
-                                  (((imm) & 0x3) == 3) ? 16 : 12, \
-                                  (((imm) & 0x3) == 3) ? 17 : 13, \
-                                  (((imm) & 0x3) == 3) ? 18 : 14, \
-                                  (((imm) & 0x3) == 3) ? 19 : 15)
+  (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
+                                     (__v4sf)(__m128)(B), (int)(imm))
 
 #define _mm512_mask_insertf32x4(W, U, A, B, imm) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
@@ -7654,24 +7602,8 @@
                                  (__v16sf)_mm512_setzero_ps())
 
 #define _mm512_inserti32x4(A, B, imm) \
-  (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
-                                 (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
-                                 (((imm) & 0x3) == 0) ? 16 :  0, \
-                                 (((imm) & 0x3) == 0) ? 17 :  1, \
-                                 (((imm) & 0x3) == 0) ? 18 :  2, \
-                                 (((imm) & 0x3) == 0) ? 19 :  3, \
-                                 (((imm) & 0x3) == 1) ? 16 :  4, \
-                                 (((imm) & 0x3) == 1) ? 17 :  5, \
-                                 (((imm) & 0x3) == 1) ? 18 :  6, \
-                                 (((imm) & 0x3) == 1) ? 19 :  7, \
-                                 (((imm) & 0x3) == 2) ? 16 :  8, \
-                                 (((imm) & 0x3) == 2) ? 17 :  9, \
-                                 (((imm) & 0x3) == 2) ? 18 : 10, \
-                                 (((imm) & 0x3) == 2) ? 19 : 11, \
-                                 (((imm) & 0x3) == 3) ? 16 : 12, \
-                                 (((imm) & 0x3) == 3) ? 17 : 13, \
-                                 (((imm) & 0x3) == 3) ? 18 : 14, \
-                                 (((imm) & 0x3) == 3) ? 19 : 15)
+  (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
+                                      (__v4si)(__m128i)(B), (int)(imm))
 
 #define _mm512_mask_inserti32x4(W, U, A, B, imm) \
   (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index 60571ad..8d6ff3e 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -1083,10 +1083,7 @@
 }
 
 #define _mm256_extractf64x2_pd(A, imm) \
-  (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A),           \
-                                   (__v4df)_mm256_undefined_pd(), \
-                                   ((imm) & 1) ? 2 : 0,           \
-                                   ((imm) & 1) ? 3 : 1)
+  (__m128d)__builtin_ia32_extractf64x2_256((__v4df)(__m256d)(A), (int)(imm))
 
 #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
   (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
@@ -1099,10 +1096,7 @@
                                    (__v2df)_mm_setzero_pd())
 
 #define _mm256_extracti64x2_epi64(A, imm) \
-  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A),             \
-                                   (__v4di)_mm256_undefined_si256(), \
-                                   ((imm) & 1) ? 2 : 0,              \
-                                   ((imm) & 1) ? 3 : 1)
+  (__m128i)__builtin_ia32_extracti64x2_256((__v4di)(__m256i)(A), (int)(imm))
 
 #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
   (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
@@ -1115,12 +1109,8 @@
                                 (__v2di)_mm_setzero_si128())
 
 #define _mm256_insertf64x2(A, B, imm) \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
-                                 (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
-                                 ((imm) & 0x1) ? 0 : 4, \
-                                 ((imm) & 0x1) ? 1 : 5, \
-                                 ((imm) & 0x1) ? 4 : 2, \
-                                 ((imm) & 0x1) ? 5 : 3)
+  (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
+                                          (__v2df)(__m128d)(B), (int)(imm))
 
 #define _mm256_mask_insertf64x2(W, U, A, B, imm) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
@@ -1133,12 +1123,8 @@
                                   (__v4df)_mm256_setzero_pd())
 
 #define _mm256_inserti64x2(A, B, imm) \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
-                                 (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
-                                 ((imm) & 0x1) ? 0 : 4, \
-                                 ((imm) & 0x1) ? 1 : 5, \
-                                 ((imm) & 0x1) ? 4 : 2, \
-                                 ((imm) & 0x1) ? 5 : 3)
+  (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
+                                          (__v2di)(__m128i)(B), (int)(imm))
 
 #define _mm256_mask_inserti64x2(W, U, A, B, imm) \
   (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 49198ec..d1e8152 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -7699,12 +7699,7 @@
 }
 
 #define _mm256_extractf32x4_ps(A, imm) \
-  (__m128)__builtin_shufflevector((__v8sf)(__m256)(A),           \
-                                  (__v8sf)_mm256_undefined_ps(), \
-                                  ((imm) & 1) ? 4 : 0,           \
-                                  ((imm) & 1) ? 5 : 1,           \
-                                  ((imm) & 1) ? 6 : 2,           \
-                                  ((imm) & 1) ? 7 : 3)
+  (__m128)__builtin_ia32_extractf32x4_256((__v8sf)(__m256)(A), (int)(imm))
 
 #define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
   (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
@@ -7717,12 +7712,7 @@
                                    (__v4sf)_mm_setzero_ps())
 
 #define _mm256_extracti32x4_epi32(A, imm) \
-  (__m128i)__builtin_shufflevector((__v8si)(__m256)(A),              \
-                                   (__v8si)_mm256_undefined_si256(), \
-                                   ((imm) & 1) ? 4 : 0,              \
-                                   ((imm) & 1) ? 5 : 1,              \
-                                   ((imm) & 1) ? 6 : 2,              \
-                                   ((imm) & 1) ? 7 : 3)
+  (__m128i)__builtin_ia32_extracti32x4_256((__v8si)(__m256i)(A), (int)(imm))
 
 #define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
   (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
@@ -7735,16 +7725,8 @@
                                 (__v4si)_mm_setzero_si128())
 
 #define _mm256_insertf32x4(A, B, imm) \
-  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
-                                  (__v8sf)_mm256_castps128_ps256((__m128)(B)), \
-                                  ((imm) & 0x1) ?  0 :  8, \
-                                  ((imm) & 0x1) ?  1 :  9, \
-                                  ((imm) & 0x1) ?  2 : 10, \
-                                  ((imm) & 0x1) ?  3 : 11, \
-                                  ((imm) & 0x1) ?  8 :  4, \
-                                  ((imm) & 0x1) ?  9 :  5, \
-                                  ((imm) & 0x1) ? 10 :  6, \
-                                  ((imm) & 0x1) ? 11 :  7)
+  (__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
+                                         (__v4sf)(__m128)(B), (int)(imm))
 
 #define _mm256_mask_insertf32x4(W, U, A, B, imm) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
@@ -7757,16 +7739,8 @@
                                   (__v8sf)_mm256_setzero_ps())
 
 #define _mm256_inserti32x4(A, B, imm) \
-  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(A), \
-                                 (__v8si)_mm256_castsi128_si256((__m128i)(B)), \
-                                 ((imm) & 0x1) ?  0 :  8, \
-                                 ((imm) & 0x1) ?  1 :  9, \
-                                 ((imm) & 0x1) ?  2 : 10, \
-                                 ((imm) & 0x1) ?  3 : 11, \
-                                 ((imm) & 0x1) ?  8 :  4, \
-                                 ((imm) & 0x1) ?  9 :  5, \
-                                 ((imm) & 0x1) ? 10 :  6, \
-                                 ((imm) & 0x1) ? 11 :  7)
+  (__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
+                                          (__v4si)(__m128i)(B), (int)(imm))
 
 #define _mm256_mask_inserti32x4(W, U, A, B, imm) \
   (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 7c85893..6c42132 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -4613,17 +4613,8 @@
 ///    result.
 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 #define _mm256_insertf128_ps(V1, V2, M) \
-  (__m256)__builtin_shufflevector( \
-    (__v8sf)(__m256)(V1), \
-    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
-    (((M) & 1) ?  0 :  8), \
-    (((M) & 1) ?  1 :  9), \
-    (((M) & 1) ?  2 : 10), \
-    (((M) & 1) ?  3 : 11), \
-    (((M) & 1) ?  8 :  4), \
-    (((M) & 1) ?  9 :  5), \
-    (((M) & 1) ? 10 :  6), \
-    (((M) & 1) ? 11 :  7) )
+  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+                                           (__v4sf)(__m128)(V2), (int)(M))
 
 /// Constructs a new 256-bit vector of [4 x double] by first duplicating
 ///    a 256-bit vector of [4 x double] given in the first parameter, and then
@@ -4660,13 +4651,8 @@
 ///    result.
 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 #define _mm256_insertf128_pd(V1, V2, M) \
-  (__m256d)__builtin_shufflevector( \
-    (__v4df)(__m256d)(V1), \
-    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
-    (((M) & 1) ? 0 : 4), \
-    (((M) & 1) ? 1 : 5), \
-    (((M) & 1) ? 4 : 2), \
-    (((M) & 1) ? 5 : 3) )
+  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+                                            (__v2df)(__m128d)(V2), (int)(M))
 
 /// Constructs a new 256-bit integer vector by first duplicating a
 ///    256-bit integer vector given in the first parameter, and then replacing
@@ -4703,13 +4689,8 @@
 ///    result.
 /// \returns A 256-bit integer vector containing the interleaved values.
 #define _mm256_insertf128_si256(V1, V2, M) \
-  (__m256i)__builtin_shufflevector( \
-    (__v4di)(__m256i)(V1), \
-    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
-    (((M) & 1) ? 0 : 4), \
-    (((M) & 1) ? 1 : 5), \
-    (((M) & 1) ? 4 : 2), \
-    (((M) & 1) ? 5 : 3) )
+  (__m256i)__builtin_ia32_vinsertf128_si256((__v4di)(__m256i)(V1), \
+                                            (__v2di)(__m128i)(V2), (int)(M))
 
 /*
    Vector extract.
@@ -4738,13 +4719,7 @@
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
 #define _mm256_extractf128_ps(V, M) \
-  (__m128)__builtin_shufflevector( \
-    (__v8sf)(__m256)(V), \
-    (__v8sf)(_mm256_undefined_ps()), \
-    (((M) & 1) ? 4 : 0), \
-    (((M) & 1) ? 5 : 1), \
-    (((M) & 1) ? 6 : 2), \
-    (((M) & 1) ? 7 : 3) )
+  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
 
 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
 ///    of [4 x double], as determined by the immediate integer parameter, and
@@ -4768,11 +4743,7 @@
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
 #define _mm256_extractf128_pd(V, M) \
-  (__m128d)__builtin_shufflevector( \
-    (__v4df)(__m256d)(V), \
-    (__v4df)(_mm256_undefined_pd()), \
-    (((M) & 1) ? 2 : 0), \
-    (((M) & 1) ? 3 : 1) )
+  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
 
 /// Extracts either the upper or the lower 128 bits from a 256-bit
 ///    integer vector, as determined by the immediate integer parameter, and
@@ -4796,11 +4767,7 @@
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit integer vector containing the extracted bits.
 #define _mm256_extractf128_si256(V, M) \
-  (__m128i)__builtin_shufflevector( \
-    (__v4di)(__m256i)(V), \
-    (__v4di)(_mm256_undefined_si256()), \
-    (((M) & 1) ? 2 : 0), \
-    (((M) & 1) ? 3 : 1) )
+  (__m128i)__builtin_ia32_vextractf128_si256((__v4di)(__m256i)(V), (int)(M))
 
 /* SIMD load ops (unaligned) */
 /// Loads two 128-bit floating-point vectors of [4 x float] from
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index d16921c..4f54059 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2607,9 +2607,33 @@
     return false;
   case X86::BI__builtin_ia32_vec_ext_v2si:
   case X86::BI__builtin_ia32_vec_ext_v2di:
+  case X86::BI__builtin_ia32_vextractf128_pd256:
+  case X86::BI__builtin_ia32_vextractf128_ps256:
+  case X86::BI__builtin_ia32_vextractf128_si256:
+  case X86::BI__builtin_ia32_extract128i256:
+  case X86::BI__builtin_ia32_extractf64x4:
+  case X86::BI__builtin_ia32_extracti64x4:
+  case X86::BI__builtin_ia32_extractf32x8:
+  case X86::BI__builtin_ia32_extracti32x8:
+  case X86::BI__builtin_ia32_extractf64x2_256:
+  case X86::BI__builtin_ia32_extracti64x2_256:
+  case X86::BI__builtin_ia32_extractf32x4_256:
+  case X86::BI__builtin_ia32_extracti32x4_256:
     i = 1; l = 0; u = 1;
     break;
   case X86::BI__builtin_ia32_vec_set_v2di:
+  case X86::BI__builtin_ia32_vinsertf128_pd256:
+  case X86::BI__builtin_ia32_vinsertf128_ps256:
+  case X86::BI__builtin_ia32_vinsertf128_si256:
+  case X86::BI__builtin_ia32_insert128i256:
+  case X86::BI__builtin_ia32_insertf32x8:
+  case X86::BI__builtin_ia32_inserti32x8:
+  case X86::BI__builtin_ia32_insertf64x4:
+  case X86::BI__builtin_ia32_inserti64x4:
+  case X86::BI__builtin_ia32_insertf64x2_256:
+  case X86::BI__builtin_ia32_inserti64x2_256:
+  case X86::BI__builtin_ia32_insertf32x4_256:
+  case X86::BI__builtin_ia32_inserti32x4_256:
     i = 2; l = 0; u = 1;
     break;
   case X86::BI__builtin_ia32_vpermilpd:
@@ -2617,6 +2641,10 @@
   case X86::BI__builtin_ia32_vec_ext_v4si:
   case X86::BI__builtin_ia32_vec_ext_v4sf:
   case X86::BI__builtin_ia32_vec_ext_v4di:
+  case X86::BI__builtin_ia32_extractf32x4:
+  case X86::BI__builtin_ia32_extracti32x4:
+  case X86::BI__builtin_ia32_extractf64x2_512:
+  case X86::BI__builtin_ia32_extracti64x2_512:
     i = 1; l = 0; u = 3;
     break;
   case X86::BI_mm_prefetch:
@@ -2633,6 +2661,10 @@
   case X86::BI__builtin_ia32_shuf_f64x2_256:
   case X86::BI__builtin_ia32_shuf_i32x4_256:
   case X86::BI__builtin_ia32_shuf_i64x2_256:
+  case X86::BI__builtin_ia32_insertf64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512:
+  case X86::BI__builtin_ia32_insertf32x4:
+  case X86::BI__builtin_ia32_inserti32x4:
     i = 2; l = 0; u = 3;
     break;
   case X86::BI__builtin_ia32_vpermil2pd:
diff --git a/clang/test/CodeGen/avx-builtins.c b/clang/test/CodeGen/avx-builtins.c
index cb4b103..5743e36 100644
--- a/clang/test/CodeGen/avx-builtins.c
+++ b/clang/test/CodeGen/avx-builtins.c
@@ -342,19 +342,19 @@
 
 __m128d test_mm256_extractf128_pd(__m256d A) {
   // CHECK-LABEL: test_mm256_extractf128_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   return _mm256_extractf128_pd(A, 1);
 }
 
 __m128 test_mm256_extractf128_ps(__m256 A) {
   // CHECK-LABEL: test_mm256_extractf128_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm256_extractf128_ps(A, 1);
 }
 
 __m128i test_mm256_extractf128_si256(__m256i A) {
   // CHECK-LABEL: test_mm256_extractf128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm256_extractf128_si256(A, 1);
 }
 
@@ -420,22 +420,22 @@
 
 __m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
   // CHECK-LABEL: test_mm256_insertf128_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_insertf128_pd(A, B, 0);
 }
 
 __m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
   // CHECK-LABEL: test_mm256_insertf128_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_insertf128_ps(A, B, 1);
 }
 
 __m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
   // CHECK-LABEL: test_mm256_insertf128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
   return _mm256_insertf128_si256(A, B, 0);
 }
 
@@ -486,7 +486,7 @@
   // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_loadu2_m128(A, B);
 }
@@ -496,7 +496,7 @@
   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
   // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm256_loadu2_m128d(A, B);
 }
@@ -506,8 +506,8 @@
   // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
   // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_loadu2_m128i(A, B);
 }
 
@@ -1169,7 +1169,7 @@
   // CHECK-LABEL: test_mm256_storeu2_m128
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
   _mm256_storeu2_m128(A, B, C);
 }
@@ -1178,7 +1178,7 @@
   // CHECK-LABEL: test_mm256_storeu2_m128d
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
   _mm256_storeu2_m128d(A, B, C);
 }
@@ -1187,7 +1187,7 @@
   // CHECK-LABEL: test_mm256_storeu2_m128i
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
   _mm256_storeu2_m128i(A, B, C);
 }
diff --git a/clang/test/CodeGen/avx-shuffle-builtins.c b/clang/test/CodeGen/avx-shuffle-builtins.c
index b5bac89..0830015 100644
--- a/clang/test/CodeGen/avx-shuffle-builtins.c
+++ b/clang/test/CodeGen/avx-shuffle-builtins.c
@@ -103,7 +103,7 @@
 
 __m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
   // CHECK-LABEL: @test_mm256_insertf128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
+  // CHECK: shufflevector{{.*}}<i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
   return _mm256_insertf128_si256(a, b, 0);
 }
 
@@ -121,7 +121,7 @@
 
 __m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
   // CHECK-LABEL: @test_mm256_insertf128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
+  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_insertf128_si256(a, b, 1);
 }
 
@@ -141,7 +141,7 @@
 
 __m128i test_mm256_extractf128_si256_0(__m256i a) {
   // CHECK-LABEL: @test_mm256_extractf128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
+  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
   return _mm256_extractf128_si256(a, 0);
 }
 
@@ -159,7 +159,7 @@
 
 __m128i test_mm256_extractf128_si256_1(__m256i a) {
   // CHECK-LABEL: @test_mm256_extractf128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
+  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
   return _mm256_extractf128_si256(a, 1);
 }
 
diff --git a/clang/test/CodeGen/avx2-builtins.c b/clang/test/CodeGen/avx2-builtins.c
index bd7e3a9..72b4349 100644
--- a/clang/test/CodeGen/avx2-builtins.c
+++ b/clang/test/CodeGen/avx2-builtins.c
@@ -386,21 +386,21 @@
 
 __m128i test0_mm256_extracti128_si256_0(__m256i a) {
   // CHECK-LABEL: test0_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
   return _mm256_extracti128_si256(a, 0);
 }
 
 __m128i test1_mm256_extracti128_si256_1(__m256i a) {
   // CHECK-LABEL: test1_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   return _mm256_extracti128_si256(a, 1);
 }
 
 // Immediate should be truncated to one bit.
 __m128i test2_mm256_extracti128_si256(__m256i a) {
   // CHECK-LABEL: test2_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
-  return _mm256_extracti128_si256(a, 2);
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  return _mm256_extracti128_si256(a, 0);
 }
 
 __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
@@ -657,14 +657,14 @@
 
 __m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
   // CHECK-LABEL: test0_mm256_inserti128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_inserti128_si256(a, b, 0);
 }
 
 __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
   // CHECK-LABEL: test1_mm256_inserti128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm256_inserti128_si256(a, b, 1);
 }
@@ -672,9 +672,9 @@
 // Immediate should be truncated to one bit.
 __m256i test2_mm256_inserti128_si256(__m256i a, __m128i b) {
   // CHECK-LABEL: test2_mm256_inserti128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  return _mm256_inserti128_si256(a, b, 2);
+  return _mm256_inserti128_si256(a, b, 0);
 }
 
 __m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
diff --git a/clang/test/CodeGen/avx512dq-builtins.c b/clang/test/CodeGen/avx512dq-builtins.c
index 756ecf3..f5c4b26 100644
--- a/clang/test/CodeGen/avx512dq-builtins.c
+++ b/clang/test/CodeGen/avx512dq-builtins.c
@@ -1075,80 +1075,80 @@
 
 __m256 test_mm512_extractf32x8_ps(__m512 __A) {
   // CHECK-LABEL: @test_mm512_extractf32x8_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_extractf32x8_ps(__A, 1); 
 }
 
 __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
   // CHECK-LABEL: @test_mm512_mask_extractf32x8_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); 
 }
 
 __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
   // CHECK-LABEL: @test_mm512_maskz_extractf32x8_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm512_maskz_extractf32x8_ps(__U, __A, 1); 
 }
 
 __m128d test_mm512_extractf64x2_pd(__m512d __A) {
   // CHECK-LABEL: @test_mm512_extractf64x2_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <2 x i32> <i32 6, i32 7>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7>
   return _mm512_extractf64x2_pd(__A, 3); 
 }
 
 __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
   // CHECK-LABEL: @test_mm512_mask_extractf64x2_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <2 x i32> <i32 6, i32 7>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); 
 }
 
 __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
   // CHECK-LABEL: @test_mm512_maskz_extractf64x2_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <2 x i32> <i32 6, i32 7>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <2 x i32> <i32 6, i32 7>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm512_maskz_extractf64x2_pd(__U, __A, 3); 
 }
 
 __m256i test_mm512_extracti32x8_epi32(__m512i __A) {
   // CHECK-LABEL: @test_mm512_extracti32x8_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_extracti32x8_epi32(__A, 1); 
 }
 
 __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_extracti32x8_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1); 
 }
 
 __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_extracti32x8_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); 
 }
 
 __m128i test_mm512_extracti64x2_epi64(__m512i __A) {
   // CHECK-LABEL: @test_mm512_extracti64x2_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <2 x i32> <i32 6, i32 7>
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
   return _mm512_extracti64x2_epi64(__A, 3); 
 }
 
 __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_extracti64x2_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <2 x i32> <i32 6, i32 7>
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); 
 }
 
 __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_extracti64x2_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <2 x i32> <i32 6, i32 7>
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); 
 }
diff --git a/clang/test/CodeGen/avx512f-builtins.c b/clang/test/CodeGen/avx512f-builtins.c
index 0ed5b7b..52554ba 100644
--- a/clang/test/CodeGen/avx512f-builtins.c
+++ b/clang/test/CodeGen/avx512f-builtins.c
@@ -1508,20 +1508,20 @@
 __m256d test_mm512_extractf64x4_pd(__m512d a)
 {
   // CHECK-LABEL: @test_mm512_extractf64x4_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm512_extractf64x4_pd(a, 1);
 }
 
 __m256d test_mm512_mask_extractf64x4_pd(__m256d  __W,__mmask8  __U,__m512d __A){
   // CHECK-LABEL:@test_mm512_mask_extractf64x4_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
 }
 
 __m256d test_mm512_maskz_extractf64x4_pd(__mmask8  __U,__m512d __A){
   // CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
 }
@@ -1529,20 +1529,20 @@
 __m128 test_mm512_extractf32x4_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_extractf32x4_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm512_extractf32x4_ps(a, 1);
 }
 
 __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8  __U,__m512d __A){
   // CHECK-LABEL:@test_mm512_mask_extractf32x4_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
 }
 
 __m128 test_mm512_maskz_extractf32x4_ps( __mmask8  __U,__m512d __A){
   // CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
  return _mm512_maskz_extractf32x4_ps(  __U, __A, 1);
 }
@@ -5514,40 +5514,40 @@
 
 __m128i test_mm512_extracti32x4_epi32(__m512i __A) {
   // CHECK-LABEL: @test_mm512_extracti32x4_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   return _mm512_extracti32x4_epi32(__A, 3); 
 }
 
 __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_extracti32x4_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); 
 }
 
 __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_extracti32x4_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); 
 }
 
 __m256i test_mm512_extracti64x4_epi64(__m512i __A) {
   // CHECK-LABEL: @test_mm512_extracti64x4_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm512_extracti64x4_epi64(__A, 1); 
 }
 
 __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_extracti64x4_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); 
 }
 
 __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_extracti64x4_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); 
 }
diff --git a/clang/test/CodeGen/avx512vl-builtins.c b/clang/test/CodeGen/avx512vl-builtins.c
index 27dca97..1edc25d 100644
--- a/clang/test/CodeGen/avx512vl-builtins.c
+++ b/clang/test/CodeGen/avx512vl-builtins.c
@@ -7101,40 +7101,40 @@
 
 __m128 test_mm256_extractf32x4_ps(__m256 __A) {
   // CHECK-LABEL: @test_mm256_extractf32x4_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm256_extractf32x4_ps(__A, 1); 
 }
 
 __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
   // CHECK-LABEL: @test_mm256_mask_extractf32x4_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1); 
 }
 
 __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
   // CHECK-LABEL: @test_mm256_maskz_extractf32x4_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm256_maskz_extractf32x4_ps(__U, __A, 1); 
 }
 
 __m128i test_mm256_extracti32x4_epi32(__m256i __A) {
   // CHECK-LABEL: @test_mm256_extracti32x4_epi32
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm256_extracti32x4_epi32(__A, 1); 
 }
 
 __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
   // CHECK-LABEL: @test_mm256_mask_extracti32x4_epi32
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1); 
 }
 
 __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
   // CHECK-LABEL: @test_mm256_maskz_extracti32x4_epi32
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm256_maskz_extracti32x4_epi32(__U, __A, 1); 
 }
diff --git a/clang/test/CodeGen/avx512vldq-builtins.c b/clang/test/CodeGen/avx512vldq-builtins.c
index ea9db6e..31ae623 100644
--- a/clang/test/CodeGen/avx512vldq-builtins.c
+++ b/clang/test/CodeGen/avx512vldq-builtins.c
@@ -1024,40 +1024,40 @@
 
 __m128d test_mm256_extractf64x2_pd(__m256d __A) {
   // CHECK-LABEL: @test_mm256_extractf64x2_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   return _mm256_extractf64x2_pd(__A, 1); 
 }
 
 __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
   // CHECK-LABEL: @test_mm256_mask_extractf64x2_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); 
 }
 
 __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
   // CHECK-LABEL: @test_mm256_maskz_extractf64x2_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm256_maskz_extractf64x2_pd(__U, __A, 1); 
 }
 
 __m128i test_mm256_extracti64x2_epi64(__m256i __A) {
   // CHECK-LABEL: @test_mm256_extracti64x2_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   return _mm256_extracti64x2_epi64(__A, 1); 
 }
 
 __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
   // CHECK-LABEL: @test_mm256_mask_extracti64x2_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); 
 }
 
 __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
   // CHECK-LABEL: @test_mm256_maskz_extracti64x2_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
   // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); 
 }