Make this kind of lowering to be supported by 256-bit instructions:
  shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
To:
  shuffle (vload ptr)), undef, <1, 1, 1, 1>
Fix PR10494

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@136691 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/X86/avx-256-splat.ll b/test/CodeGen/X86/avx-256-splat.ll
index edc17b7..36d4694 100644
--- a/test/CodeGen/X86/avx-256-splat.ll
+++ b/test/CodeGen/X86/avx-256-splat.ll
@@ -45,3 +45,35 @@
   %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
   ret <4 x double> %vecinit6.i
 }
+
+; Test this simple opt:
+;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+; To:
+;   shuffle (vload ptr)), undef, <1, 1, 1, 1>
+; CHECK: vmovaps
+; CHECK-NEXT: vpextrd
+define void @funcE() nounwind {
+allocas:
+  %udx495 = alloca [18 x [18 x float]], align 32
+  br label %for_test505.preheader
+
+for_test505.preheader:                            ; preds = %for_test505.preheader, %allocas
+  br i1 undef, label %for_exit499, label %for_test505.preheader
+
+for_exit499:                                      ; preds = %for_test505.preheader
+  br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247
+
+load.i1247:                                       ; preds = %for_exit499
+  %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
+  %ptr.i1237 = bitcast float* %ptr1227 to i32*
+  %val.i1238 = load i32* %ptr.i1237, align 4
+  %ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
+  %ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
+  %phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
+  br label %__load_and_broadcast_32.exit1249
+
+__load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_exit499
+  %load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
+  ret void
+}
+