AMDGPU: Stop wasting argument registers with v3i32/v3f32
SelectionDAGBuilder widens v3i32/v3f32 arguments
to v4i32/v4f32, which consume an additional register.
In addition to wasting argument space, this produces extra
instructions since now it appears the 4th vector component has
a meaningful value to most combines.
llvm-svn: 338197
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b7fc26..97c38e4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -694,6 +694,52 @@
return false;
}
+/// Select the register type used to pass a value of type \p VT under calling
+/// convention \p CC.
+///
+/// For every convention except AMDGPU_KERNEL, a 3-element vector whose
+/// elements are 32 bits wide is passed using its scalar element type. All
+/// other cases defer to the generic TargetLowering implementation.
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                    CallingConv::ID CC,
+                                                    EVT VT) const {
+  // Kernels and anything that is not a 3-element vector use the default.
+  if (CC == CallingConv::AMDGPU_KERNEL || !VT.isVector() ||
+      VT.getVectorNumElements() != 3)
+    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+  const EVT EltVT = VT.getScalarType();
+  if (EltVT.getSizeInBits() != 32)
+    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+  // One register per element, each of the element's own type.
+  return EltVT.getSimpleVT();
+}
+
+/// Report how many registers are needed to pass a value of type \p VT under
+/// calling convention \p CC.
+///
+/// For every convention except AMDGPU_KERNEL, a 3-element vector with 32-bit
+/// elements takes exactly three registers. All other cases defer to the
+/// generic TargetLowering implementation.
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+                                                         CallingConv::ID CC,
+                                                         EVT VT) const {
+  // Kernels and anything that is not a 3-element vector use the default.
+  if (CC == CallingConv::AMDGPU_KERNEL || !VT.isVector() ||
+      VT.getVectorNumElements() != 3)
+    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+  const EVT EltVT = VT.getScalarType();
+  if (EltVT.getSizeInBits() != 32)
+    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+  // One register for each of the three elements.
+  return 3;
+}
+
+/// Describe how the vector type \p VT is split into pieces when passed under
+/// calling convention \p CC.
+///
+/// For every convention except AMDGPU_KERNEL, a 3-element vector whose
+/// elements are 32 or 64 bits wide is broken into three scalar pieces:
+/// \p RegisterVT and \p IntermediateVT are both set to the element type,
+/// \p NumIntermediates is set to 3, and 3 is returned. All other cases defer
+/// to the generic TargetLowering implementation.
+///
+/// NOTE(review): 64-bit elements are accepted here, whereas
+/// getRegisterTypeForCallingConv and getNumRegistersForCallingConv only
+/// special-case 32-bit elements -- confirm that the difference is intended.
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC,
+    EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const {
+  // Kernels and vectors that do not have exactly three elements use the
+  // default breakdown.
+  if (CC == CallingConv::AMDGPU_KERNEL || VT.getVectorNumElements() != 3)
+    return TargetLowering::getVectorTypeBreakdownForCallingConv(
+        Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+
+  const EVT EltVT = VT.getScalarType();
+  const auto EltBits = EltVT.getSizeInBits();
+  if (EltBits != 32 && EltBits != 64)
+    return TargetLowering::getVectorTypeBreakdownForCallingConv(
+        Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+
+  // Pass each element in its own register of the element type.
+  RegisterVT = EltVT.getSimpleVT();
+  IntermediateVT = RegisterVT;
+  NumIntermediates = 3;
+  return 3;
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,