[SystemZ] Handle sub-128 vectors

The ABI allows sub-128 vectors to be passed and returned in registers,
with the vector occupying the upper part of a register.  We therefore
want to legalize those types by widening the vector rather than promoting
the elements.

The patch includes some simple tests for sub-128 vectors and also tests
that we can recognize various pack sequences, some of which use sub-128
vectors as temporary results.  One of these forms is based on the pack
sequences generated by llvmpipe when no intrinsics are used.

Signed unpacks are recognized as BUILD_VECTORs whose elements are
individually sign-extended.  Unsigned unpacks can have the equivalent
form with zero extension, but they also occur as shuffles in which some
elements are zero.

Based on a patch by Richard Sandiford.

llvm-svn: 236525
diff --git a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
index f9da34b..a359344 100644
--- a/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-combine-01.ll
@@ -105,3 +105,51 @@
   %res = add i16 %elem1, %elem2
   ret i16 %res
 }
+
+; Test a case where an unpack high can be eliminated from the usual
+; load-extend sequence.
+define void @f6(<8 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
+; CHECK-LABEL: f6:
+; CHECK: vlrepg [[REG:%v[0-9]+]], 0(%r2)
+; CHECK-NOT: vup
+; CHECK-DAG: vsteb [[REG]], 0(%r3), 1
+; CHECK-DAG: vsteb [[REG]], 0(%r4), 2
+; CHECK-DAG: vsteb [[REG]], 0(%r5), 7
+; CHECK: br %r14
+  %vec = load <8 x i8>, <8 x i8> *%ptr1
+  %ext = sext <8 x i8> %vec to <8 x i16>
+  %elem1 = extractelement <8 x i16> %ext, i32 1
+  %elem2 = extractelement <8 x i16> %ext, i32 2
+  %elem3 = extractelement <8 x i16> %ext, i32 7
+  %trunc1 = trunc i16 %elem1 to i8
+  %trunc2 = trunc i16 %elem2 to i8
+  %trunc3 = trunc i16 %elem3 to i8
+  store i8 %trunc1, i8 *%ptr2
+  store i8 %trunc2, i8 *%ptr3
+  store i8 %trunc3, i8 *%ptr4
+  ret void
+}
+
+; ...and again with a bitcast inbetween.
+define void @f7(<4 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) {
+; CHECK-LABEL: f7:
+; CHECK: vlrepf [[REG:%v[0-9]+]], 0(%r2)
+; CHECK-NOT: vup
+; CHECK-DAG: vsteb [[REG]], 0(%r3), 0
+; CHECK-DAG: vsteb [[REG]], 0(%r4), 1
+; CHECK-DAG: vsteb [[REG]], 0(%r5), 3
+; CHECK: br %r14
+  %vec = load <4 x i8>, <4 x i8> *%ptr1
+  %ext = sext <4 x i8> %vec to <4 x i32>
+  %bitcast = bitcast <4 x i32> %ext to <8 x i16>
+  %elem1 = extractelement <8 x i16> %bitcast, i32 1
+  %elem2 = extractelement <8 x i16> %bitcast, i32 3
+  %elem3 = extractelement <8 x i16> %bitcast, i32 7
+  %trunc1 = trunc i16 %elem1 to i8
+  %trunc2 = trunc i16 %elem2 to i8
+  %trunc3 = trunc i16 %elem3 to i8
+  store i8 %trunc1, i8 *%ptr2
+  store i8 %trunc2, i8 *%ptr3
+  store i8 %trunc3, i8 *%ptr4
+  ret void
+}