Re-commit r355104: "[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1."
The code to materialize a mask from a constant pool load tried to use a 128-bit
LDR to load a 64-bit constant pool entry, which was only 8-byte aligned. This
resulted in a link failure in the NEON tests in the test suite, since the LDR
address was unaligned. This change fixes that by emitting a 64-bit LDR when the
entry is 64 bits wide, then converting back to a 128-bit register for the TBL.
llvm-svn: 355326
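
As a rough sketch of the effect (registers and the constant pool label here
are illustrative, not taken from actual output), a 64-bit shuffle selected via
TBL1 should now load its mask with a D-register LDR:

  adrp x8, .LCPI0_0
  ldr  d1, [x8, :lo12:.LCPI0_0]   // 64-bit load of the 8-byte mask entry
  ...                             // concat the sources, widen the mask to 128b
  tbl  v0.16b, { v2.16b }, v1.16b // low 64 bits of v0 hold the result

where previously the mask load was "ldr q1, [x8, :lo12:.LCPI0_0]", whose page
offset must be 16-byte aligned.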
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 2a5599f..41f4eb95 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -67,7 +67,7 @@
// Helper to generate an equivalent of scalar_to_vector into a new register,
// returned via 'Dst'.
- MachineInstr *emitScalarToVector(const LLT DstTy,
+ MachineInstr *emitScalarToVector(unsigned EltSize,
const TargetRegisterClass *DstRC,
unsigned Scalar,
MachineIRBuilder &MIRBuilder) const;
@@ -82,6 +82,8 @@
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
+ MachineIRBuilder &MIRBuilder) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
@@ -1713,7 +1715,7 @@
}
MachineInstr *AArch64InstructionSelector::emitScalarToVector(
- const LLT DstTy, const TargetRegisterClass *DstRC, unsigned Scalar,
+ unsigned EltSize, const TargetRegisterClass *DstRC, unsigned Scalar,
MachineIRBuilder &MIRBuilder) const {
auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
@@ -1727,7 +1729,7 @@
return &*Ins;
};
- switch (DstTy.getElementType().getSizeInBits()) {
+ switch (EltSize) {
case 16:
return BuildFn(AArch64::hsub);
case 32:
@@ -1957,13 +1959,123 @@
auto Adrp =
MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
.addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
- auto Load =
- MIRBuilder.buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
- .addConstantPoolIndex(CPIdx, 0,
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ MachineInstr *LoadMI = nullptr;
+ switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+ case 16:
+ LoadMI =
+ &*MIRBuilder
+ .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
+ .addConstantPoolIndex(CPIdx, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ case 8:
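+ // A 64-bit constant pool entry is only guaranteed 8-byte alignment, so
+ // use a 64-bit LDR; the 128-bit LDR's page-offset relocation needs a
+ // 16-byte-aligned address and caused link failures.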
+ LoadMI = &*MIRBuilder
+ .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+ .addConstantPoolIndex(
+ CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ default:
+ LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
+ << *CPVal->getType());
+ return nullptr;
+ }
constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
- constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
- return &*Load;
+ constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
+ return LoadMI;
+}
+
+/// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
+/// size and RB.
+static std::pair<unsigned, unsigned>
+getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
+ unsigned Opc, SubregIdx;
+ if (RB.getID() == AArch64::GPRRegBankID) {
+ if (EltSize == 32) {
+ Opc = AArch64::INSvi32gpr;
+ SubregIdx = AArch64::ssub;
+ } else if (EltSize == 64) {
+ Opc = AArch64::INSvi64gpr;
+ SubregIdx = AArch64::dsub;
+ } else {
+ llvm_unreachable("invalid elt size!");
+ }
+ } else {
+ if (EltSize == 8) {
+ Opc = AArch64::INSvi8lane;
+ SubregIdx = AArch64::bsub;
+ } else if (EltSize == 16) {
+ Opc = AArch64::INSvi16lane;
+ SubregIdx = AArch64::hsub;
+ } else if (EltSize == 32) {
+ Opc = AArch64::INSvi32lane;
+ SubregIdx = AArch64::ssub;
+ } else if (EltSize == 64) {
+ Opc = AArch64::INSvi64lane;
+ SubregIdx = AArch64::dsub;
+ } else {
+ llvm_unreachable("invalid elt size!");
+ }
+ }
+ return std::make_pair(Opc, SubregIdx);
+}
+
+MachineInstr *AArch64InstructionSelector::emitVectorConcat(
+ unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
+ // We implement a vector concat by:
+ // 1. Use scalar_to_vector to insert the lower vector into the larger dest
+ // 2. Insert the upper vector into the destination's upper element
+ // TODO: some of this code is common with G_BUILD_VECTOR handling.
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+ const LLT Op1Ty = MRI.getType(Op1);
+ const LLT Op2Ty = MRI.getType(Op2);
+
+ if (Op1Ty != Op2Ty) {
+ LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
+ return nullptr;
+ }
+ assert(Op1Ty.isVector() && "Expected a vector for vector concat");
+
+ if (Op1Ty.getSizeInBits() >= 128) {
+ LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
+ return nullptr;
+ }
+
+ // At the moment we just support 64 bit vector concats.
+ if (Op1Ty.getSizeInBits() != 64) {
+ LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
+ return nullptr;
+ }
+
+ const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
+ const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
+
+ MachineInstr *WidenedOp1 =
+ emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
+ MachineInstr *WidenedOp2 =
+ emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
+ if (!WidenedOp1 || !WidenedOp2) {
+ LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
+ return nullptr;
+ }
+
+ // Now do the insert of the upper element.
+ unsigned InsertOpc, InsSubRegIdx;
+ std::tie(InsertOpc, InsSubRegIdx) =
+ getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
+
+ auto InsElt =
+ MIRBuilder
+ .buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
+ .addImm(1) /* Lane index */
+ .addUse(WidenedOp2->getOperand(0).getReg())
+ .addImm(0);
+
+ constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
+ return &*InsElt;
}
bool AArch64InstructionSelector::selectShuffleVector(
@@ -2003,21 +2115,43 @@
}
}
- if (DstTy.getSizeInBits() != 128) {
- assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
- // This case can be done with TBL1.
- return false;
- }
+ MachineIRBuilder MIRBuilder(I);
// Use a constant pool to load the index vector for TBL.
Constant *CPVal = ConstantVector::get(CstIdxs);
- MachineIRBuilder MIRBuilder(I);
MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
if (!IndexLoad) {
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
return false;
}
+ if (DstTy.getSizeInBits() != 128) {
+ assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
+ // This case can be done with TBL1.
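+ // TBLv16i8One takes a 128-bit table register, so first concatenate the
+ // two 64-bit sources into a single Q register.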
+ MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
+ if (!Concat) {
+ LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
+ return false;
+ }
+
+ // The constant pool load will be 64 bits, so we need to convert it to an FPR128 reg.
+ IndexLoad =
+ emitScalarToVector(64, &AArch64::FPR128RegClass,
+ IndexLoad->getOperand(0).getReg(), MIRBuilder);
+
+ auto TBL1 = MIRBuilder.buildInstr(
+ AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
+ {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
+ constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
+
+ auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
+ .addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
+ RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
+ I.eraseFromParent();
+ return true;
+ }
+
// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
// Q registers for regalloc.
auto RegSeq = MIRBuilder
@@ -2049,32 +2183,15 @@
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
unsigned Opc;
unsigned SubregIdx;
- if (RB.getID() == AArch64::GPRRegBankID) {
- if (EltSize == 32) {
- Opc = AArch64::INSvi32gpr;
- SubregIdx = AArch64::ssub;
- } else {
- Opc = AArch64::INSvi64gpr;
- SubregIdx = AArch64::dsub;
- }
- } else {
- if (EltSize == 16) {
- Opc = AArch64::INSvi16lane;
- SubregIdx = AArch64::hsub;
- } else if (EltSize == 32) {
- Opc = AArch64::INSvi32lane;
- SubregIdx = AArch64::ssub;
- } else {
- Opc = AArch64::INSvi64lane;
- SubregIdx = AArch64::dsub;
- }
- }
+
+ std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
MachineIRBuilder MIRBuilder(I);
const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
MachineInstr *ScalarToVec =
- emitScalarToVector(DstTy, DstRC, I.getOperand(1).getReg(), MIRBuilder);
+ emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
+ I.getOperand(1).getReg(), MIRBuilder);
if (!ScalarToVec)
return false;