Re-commit r355104: "[AArch64][GlobalISel] Add support for 64 bit vector shuffle using TBL1."
The code to materialize a mask from a constant pool load tried to use a 128-bit
LDR to load a 64-bit constant pool entry, which was only 8-byte aligned. This
resulted in a link failure in the NEON tests in the test suite, since the LDR
address was unaligned. This change fixes that by emitting a 64-bit LDR when the
entry is 64 bits wide, then converting back to a 128-bit register for the TBL.
llvm-svn: 355326
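
As a rough sketch of the effect (registers and the constant pool label here
are illustrative, not taken from actual output), a 64-bit shuffle selected via
TBL1 should now load its mask with a D-register LDR:

  adrp x8, .LCPI0_0
  ldr  d1, [x8, :lo12:.LCPI0_0]   // 64-bit load of the 8-byte mask entry
  ...                             // concat the sources, widen the mask to 128b
  tbl  v0.16b, { v2.16b }, v1.16b // low 64 bits of v0 hold the result

where previously the mask load was "ldr q1, [x8, :lo12:.LCPI0_0]", whose page
offset must be 16-byte aligned.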
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 2a5599f..41f4eb95 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -67,7 +67,7 @@
// Helper to generate an equivalent of scalar_to_vector into a new register,
// returned via 'Dst'.
- MachineInstr *emitScalarToVector(const LLT DstTy,
+ MachineInstr *emitScalarToVector(unsigned EltSize,
const TargetRegisterClass *DstRC,
unsigned Scalar,
MachineIRBuilder &MIRBuilder) const;
@@ -82,6 +82,8 @@
unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *emitVectorConcat(unsigned Op1, unsigned Op2,
+ MachineIRBuilder &MIRBuilder) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
@@ -1713,7 +1715,7 @@
}
MachineInstr *AArch64InstructionSelector::emitScalarToVector(
- const LLT DstTy, const TargetRegisterClass *DstRC, unsigned Scalar,
+ unsigned EltSize, const TargetRegisterClass *DstRC, unsigned Scalar,
MachineIRBuilder &MIRBuilder) const {
auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
@@ -1727,7 +1729,7 @@
return &*Ins;
};
- switch (DstTy.getElementType().getSizeInBits()) {
+ switch (EltSize) {
case 16:
return BuildFn(AArch64::hsub);
case 32:
@@ -1957,13 +1959,123 @@
auto Adrp =
MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
.addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
- auto Load =
- MIRBuilder.buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
- .addConstantPoolIndex(CPIdx, 0,
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+ MachineInstr *LoadMI = nullptr;
+ switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
+ case 16:
+ LoadMI =
+ &*MIRBuilder
+ .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
+ .addConstantPoolIndex(CPIdx, 0,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ case 8:
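+ // A 64-bit constant pool entry is only guaranteed 8-byte alignment, so
+ // use a 64-bit LDR; the 128-bit LDR's page-offset relocation needs a
+ // 16-byte-aligned address and caused link failures.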
+ LoadMI = &*MIRBuilder
+ .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
+ .addConstantPoolIndex(
+ CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ break;
+ default:
+ LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
+ << *CPVal->getType());
+ return nullptr;
+ }
constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
- constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
- return &*Load;
+ constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
+ return LoadMI;
+}
+
+/// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
+/// size and RB.
+static std::pair<unsigned, unsigned>
+getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
+ unsigned Opc, SubregIdx;
+ if (RB.getID() == AArch64::GPRRegBankID) {
+ if (EltSize == 32) {
+ Opc = AArch64::INSvi32gpr;
+ SubregIdx = AArch64::ssub;
+ } else if (EltSize == 64) {
+ Opc = AArch64::INSvi64gpr;
+ SubregIdx = AArch64::dsub;
+ } else {
+ llvm_unreachable("invalid elt size!");
+ }
+ } else {
+ if (EltSize == 8) {
+ Opc = AArch64::INSvi8lane;
+ SubregIdx = AArch64::bsub;
+ } else if (EltSize == 16) {
+ Opc = AArch64::INSvi16lane;
+ SubregIdx = AArch64::hsub;
+ } else if (EltSize == 32) {
+ Opc = AArch64::INSvi32lane;
+ SubregIdx = AArch64::ssub;
+ } else if (EltSize == 64) {
+ Opc = AArch64::INSvi64lane;
+ SubregIdx = AArch64::dsub;
+ } else {
+ llvm_unreachable("invalid elt size!");
+ }
+ }
+ return std::make_pair(Opc, SubregIdx);
+}
+
+MachineInstr *AArch64InstructionSelector::emitVectorConcat(
+ unsigned Op1, unsigned Op2, MachineIRBuilder &MIRBuilder) const {
+ // We implement a vector concat by:
+ // 1. Use scalar_to_vector to insert the lower vector into the larger dest
+ // 2. Insert the upper vector into the destination's upper element
+ // TODO: some of this code is common with G_BUILD_VECTOR handling.
+ MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+
+ const LLT Op1Ty = MRI.getType(Op1);
+ const LLT Op2Ty = MRI.getType(Op2);
+
+ if (Op1Ty != Op2Ty) {
+ LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
+ return nullptr;
+ }
+ assert(Op1Ty.isVector() && "Expected a vector for vector concat");
+
+ if (Op1Ty.getSizeInBits() >= 128) {
+ LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
+ return nullptr;
+ }
+
+ // At the moment we just support 64 bit vector concats.
+ if (Op1Ty.getSizeInBits() != 64) {
+ LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
+ return nullptr;
+ }
+
+ const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
+ const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
+
+ MachineInstr *WidenedOp1 =
+ emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
+ MachineInstr *WidenedOp2 =
+ emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
+ if (!WidenedOp1 || !WidenedOp2) {
+ LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
+ return nullptr;
+ }
+
+ // Now do the insert of the upper element.
+ unsigned InsertOpc, InsSubRegIdx;
+ std::tie(InsertOpc, InsSubRegIdx) =
+ getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
+
+ auto InsElt =
+ MIRBuilder
+ .buildInstr(InsertOpc, {DstRC}, {WidenedOp1->getOperand(0).getReg()})
+ .addImm(1) /* Lane index */
+ .addUse(WidenedOp2->getOperand(0).getReg())
+ .addImm(0);
+
+ constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
+ return &*InsElt;
}
bool AArch64InstructionSelector::selectShuffleVector(
@@ -2003,21 +2115,43 @@
}
}
- if (DstTy.getSizeInBits() != 128) {
- assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
- // This case can be done with TBL1.
- return false;
- }
+ MachineIRBuilder MIRBuilder(I);
// Use a constant pool to load the index vector for TBL.
Constant *CPVal = ConstantVector::get(CstIdxs);
- MachineIRBuilder MIRBuilder(I);
MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
if (!IndexLoad) {
LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
return false;
}
+ if (DstTy.getSizeInBits() != 128) {
+ assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
+ // This case can be done with TBL1.
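+ // TBLv16i8One takes a 128-bit table register, so first concatenate the
+ // two 64-bit sources into a single Q register.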
+ MachineInstr *Concat = emitVectorConcat(Src1Reg, Src2Reg, MIRBuilder);
+ if (!Concat) {
+ LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
+ return false;
+ }
+
+ // The constant pool load will be 64 bits, so we need to convert it to an FPR128 reg.
+ IndexLoad =
+ emitScalarToVector(64, &AArch64::FPR128RegClass,
+ IndexLoad->getOperand(0).getReg(), MIRBuilder);
+
+ auto TBL1 = MIRBuilder.buildInstr(
+ AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
+ {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
+ constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
+
+ auto Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), I.getOperand(0).getReg())
+ .addUse(TBL1->getOperand(0).getReg(), 0, AArch64::dsub);
+ RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
+ I.eraseFromParent();
+ return true;
+ }
+
// For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
// Q registers for regalloc.
auto RegSeq = MIRBuilder
@@ -2049,32 +2183,15 @@
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
unsigned Opc;
unsigned SubregIdx;
- if (RB.getID() == AArch64::GPRRegBankID) {
- if (EltSize == 32) {
- Opc = AArch64::INSvi32gpr;
- SubregIdx = AArch64::ssub;
- } else {
- Opc = AArch64::INSvi64gpr;
- SubregIdx = AArch64::dsub;
- }
- } else {
- if (EltSize == 16) {
- Opc = AArch64::INSvi16lane;
- SubregIdx = AArch64::hsub;
- } else if (EltSize == 32) {
- Opc = AArch64::INSvi32lane;
- SubregIdx = AArch64::ssub;
- } else {
- Opc = AArch64::INSvi64lane;
- SubregIdx = AArch64::dsub;
- }
- }
+
+ std::tie(Opc, SubregIdx) = getInsertVecEltOpInfo(RB, EltSize);
MachineIRBuilder MIRBuilder(I);
const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
MachineInstr *ScalarToVec =
- emitScalarToVector(DstTy, DstRC, I.getOperand(1).getReg(), MIRBuilder);
+ emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
+ I.getOperand(1).getReg(), MIRBuilder);
if (!ScalarToVec)
return false;