ARM: use LLVM IR to represent the vshrn operation

vshrn is just the combination of a right shift and a truncate. The
limits on the immediate value also mean the signedness of the shift
doesn't matter: the shift amount is at most the width of the narrow
destination type, so any bits that differ between an arithmetic and a
logical shift are discarded by the truncate. Using that representation
allows us to get rid of an ARM-specific intrinsic, share more code with
AArch64 and hopefully get better code out of the mid-end optimisers.
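
For example, a narrowing right shift by 12 from <4 x i32> to <4 x i16>
(value names are illustrative; the pattern is taken from the
reg_sequence.ll change below) is now written as:

  %shifted = lshr <4 x i32> %in, <i32 12, i32 12, i32 12, i32 12>
  %narrowed = trunc <4 x i32> %shifted to <4 x i16>

and the backend matches the shift/truncate pair back to a single
vshrn.i32.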

llvm-svn: 201085
diff --git a/llvm/test/CodeGen/ARM/reg_sequence.ll b/llvm/test/CodeGen/ARM/reg_sequence.ll
index 25484f4..b245674 100644
--- a/llvm/test/CodeGen/ARM/reg_sequence.ll
+++ b/llvm/test/CodeGen/ARM/reg_sequence.ll
@@ -34,9 +34,11 @@
   %12 = sext <4 x i16> %11 to <4 x i32>           ; <<4 x i32>> [#uses=1]
   %13 = mul <4 x i32> %1, %9                      ; <<4 x i32>> [#uses=1]
   %14 = mul <4 x i32> %3, %12                     ; <<4 x i32>> [#uses=1]
-  %15 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %13, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
-  %16 = tail call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %14, <4 x i32> <i32 -12, i32 -12, i32 -12, i32 -12>) ; <<4 x i16>> [#uses=1]
-  %17 = shufflevector <4 x i16> %15, <4 x i16> %16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
+  %15 = lshr <4 x i32> %13, <i32 12, i32 12, i32 12, i32 12>
+  %trunc_15 = trunc <4 x i32> %15 to <4 x i16>
+  %16 = lshr <4 x i32> %14, <i32 12, i32 12, i32 12, i32 12>
+  %trunc_16 = trunc <4 x i32> %16 to <4 x i16>
+  %17 = shufflevector <4 x i16> %trunc_15, <4 x i16> %trunc_16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; <<8 x i16>> [#uses=1]
   %18 = bitcast i16* %o_ptr to i8*                ; <i8*> [#uses=1]
   tail call void @llvm.arm.neon.vst1.v8i16(i8* %18, <8 x i16> %17, i32 1)
   ret void
diff --git a/llvm/test/CodeGen/ARM/vshrn.ll b/llvm/test/CodeGen/ARM/vshrn.ll
index 40a94fe..cc936be 100644
--- a/llvm/test/CodeGen/ARM/vshrn.ll
+++ b/llvm/test/CodeGen/ARM/vshrn.ll
@@ -4,29 +4,62 @@
 ;CHECK-LABEL: vshrns8:
 ;CHECK: vshrn.i16
 	%tmp1 = load <8 x i16>* %A
-	%tmp2 = call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >)
-	ret <8 x i8> %tmp2
+	%tmp2 = lshr <8 x i16> %tmp1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+	%tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+	ret <8 x i8> %tmp3
 }
 
 define <4 x i16> @vshrns16(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: vshrns16:
 ;CHECK: vshrn.i32
 	%tmp1 = load <4 x i32>* %A
-	%tmp2 = call <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >)
-	ret <4 x i16> %tmp2
+	%tmp2 = ashr <4 x i32> %tmp1, <i32 16, i32 16, i32 16, i32 16>
+	%tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+	ret <4 x i16> %tmp3
 }
 
 define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind {
 ;CHECK-LABEL: vshrns32:
 ;CHECK: vshrn.i64
 	%tmp1 = load <2 x i64>* %A
-	%tmp2 = call <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >)
-	ret <2 x i32> %tmp2
+	%tmp2 = ashr <2 x i64> %tmp1, <i64 32, i64 32>
+	%tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+	ret <2 x i32> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
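+; Shift amounts larger than the narrow element width (9 > 8 here) are
+; out of vshrn's immediate range, so a full-width vshr plus vmovn is expected.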
+define <8 x i8> @vshrns8_bad(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: vshrns8_bad:
+;CHECK: vshr.s16
+;CHECK: vmovn.i16
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = ashr <8 x i16> %tmp1, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
+	%tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+	ret <8 x i8> %tmp3
+}
+
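+; A shift by 17 is out of vshrn.i32's 1-16 immediate range.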
+define <4 x i16> @vshrns16_bad(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: vshrns16_bad:
+;CHECK: vshr.u32
+;CHECK: vmovn.i32
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = lshr <4 x i32> %tmp1, <i32 17, i32 17, i32 17, i32 17>
+	%tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+	ret <4 x i16> %tmp3
+}
+
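+; A shift by 33 is out of vshrn.i64's 1-32 immediate range.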
+define <2 x i32> @vshrns32_bad(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: vshrns32_bad:
+;CHECK: vshr.u64
+;CHECK: vmovn.i64
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = lshr <2 x i64> %tmp1, <i64 33, i64 33>
+	%tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+	ret <2 x i32> %tmp3
+}
 
 define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: vrshrns8: