Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during                                                                                                                                    
isel lowering to fold the zero-extend's and take advantage of no-stall
back to back vmul + vmla:
 vmull q0, d4, d6
 vmlal q0, d5, d6
is faster than
 vaddl q0, d4, d5
 vmovl q1, d6                                                                                                                                                                             
 vmul  q0, q0, q1

This allows us to vmull + vmlal for:
    f = vmull_u8(   vget_high_u8(s), c);
    f = vmlal_u8(f, vget_low_u8(s),  c);

rdar://9197392


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@128444 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index ee033ca..585394e 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -339,3 +339,32 @@
   %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
   ret <2 x i64> %tmp4
 }
+
+; rdar://9197392
+define void @distribue(i16* %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK: distribue:
+; CHECK: vmull.u8 [[REG1:(q[0-9]+)]], d{{.*}}, [[REG2:(d[0-9]+)]]
+; CHECK: vmlal.u8 [[REG1]], d{{.*}}, [[REG2]]
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = zext <8 x i8> %6 to <8 x i16>
+  %8 = zext <8 x i8> %2 to <8 x i16>
+  %9 = extractelement <2 x double> %4, i32 0
+  %10 = bitcast double %9 to <8 x i8>
+  %11 = zext <8 x i8> %10 to <8 x i16>
+  %12 = add <8 x i16> %7, %11
+  %13 = mul <8 x i16> %12, %8
+  %14 = bitcast i16* %dst to i8*
+  tail call void @llvm.arm.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
+  ret void
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind