AMDGPU: Stop wasting argument registers with v3i32/v3f32

SelectionDAGBuilder widens v3i32/v3f32 arguments to
v4i32/v4f32, which consumes an additional register.
Besides wasting argument space, this also produces extra
instructions, since most combines now treat the 4th vector
component as if it held a meaningful value.
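
To make the register cost concrete, a small standalone C++ sketch
(illustration only, not part of this patch; the function name
argRegsForV3x32 is made up) of how many 32-bit argument registers a
<3 x i32>/<3 x float> value takes with and without this change:

  #include <cassert>

  // Number of 32-bit argument registers used for a 3-element vector of
  // 32-bit elements passed to a non-kernel AMDGPU function.
  static unsigned argRegsForV3x32(bool SplitIntoScalars) {
    // With the hooks added in this patch, v3i32/v3f32 is broken into
    // three scalar i32/f32 registers.
    if (SplitIntoScalars)
      return 3;
    // Otherwise SelectionDAGBuilder widens the value to v4i32/v4f32,
    // which occupies four registers and carries an undefined 4th lane
    // into later combines.
    return 4;
  }

  int main() {
    assert(argRegsForV3x32(/*SplitIntoScalars=*/true) == 3);
    assert(argRegsForV3x32(/*SplitIntoScalars=*/false) == 4);
  }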

llvm-svn: 338197
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b7fc26..97c38e4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -694,6 +694,52 @@
   return false;
 }
 
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                    CallingConv::ID CC,
+                                                    EVT VT) const {
+  if (CC != CallingConv::AMDGPU_KERNEL &&
+      VT.isVector() && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32)
+      return ScalarVT.getSimpleVT();
+  }
+
+  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const {
+  if (CC != CallingConv::AMDGPU_KERNEL &&
+      VT.isVector() && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32)
+      return 3;
+  }
+
+  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+}
+
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+  LLVMContext &Context, CallingConv::ID CC,
+  EVT VT, EVT &IntermediateVT,
+  unsigned &NumIntermediates, MVT &RegisterVT) const {
+
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32 ||
+        ScalarVT.getSizeInBits() == 64) {
+      RegisterVT = ScalarVT.getSimpleVT();
+      IntermediateVT = RegisterVT;
+      NumIntermediates = 3;
+      return NumIntermediates;
+    }
+  }
+
+  return TargetLowering::getVectorTypeBreakdownForCallingConv(
+    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,