[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.

Currently, there are substantial problems forming vld1_dup even if the
VDUP survives legalization. The lack of a dedicated node leads to
terrible results: not only can we not form post-increment vld1_dup
instructions, but we form scalar pre-increment and post-increment loads
that force the loaded value into a GPR. This patch fixes that by
combining the VDUP and its load into an ARMISD::VLD1DUP node during
DAGCombine, before the generic combiner rewrites the load into an
indexed form that can no longer be matched.
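
As an illustration (not part of the patch), a minimal sketch of the kind
of source this affects, assuming <arm_neon.h> and ARM NEON codegen:

    #include <arm_neon.h>

    // Each iteration loads a scalar and splats it across a vector. Before
    // this change, the load could become a scalar post-increment load into
    // a GPR followed by a VDUP from that GPR; with ARMISD::VLD1DUP the
    // load+splat pair can instead select to a single post-incrementing
    // vld1-dup instruction.
    float32x2_t sum_splats(const float *p, int n) {
      float32x2_t acc = vdup_n_f32(0.0f);
      for (int i = 0; i < n; ++i)
        acc = vadd_f32(acc, vdup_n_f32(p[i])); // load + VDUP -> VLD1DUP
      return acc;
    }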

Also includes a crash fix for vld2_dup with a variable post-increment
(see testcase @vld2dupi8_postinc_variable).
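
A rough sketch of the shape of that test (a hypothetical C equivalent;
the actual testcase is LLVM IR), assuming <arm_neon.h>:

    #include <arm_neon.h>

    // A vld2 load-duplicate whose pointer advances by a non-constant
    // stride becomes a VLD2DUP_UPD with a register (rather than fixed)
    // increment, which previously crashed instruction selection.
    uint8x8x2_t load_pair(const uint8_t **pp, long stride) {
      uint8x8x2_t v = vld2_dup_u8(*pp); // load 2 bytes, splat into 2 vectors
      *pp += stride;                    // variable post-increment
      return v;
    }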

Recommitting with a fix to avoid forming a VLD1DUP node if the type of
the load doesn't match the type of the VDUP (see
https://llvm.org/bugs/show_bug.cgi?id=31404).
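
This corresponds to the LD->getMemoryVT() check in PerformVDUPCombine
below. A hypothetical example of a pattern the combine must now reject:

    #include <arm_neon.h>

    // The load is an i8 extending load but the VDUP produces 16-bit lanes,
    // so the load's memory type (i8) does not match the vector element
    // type (i16) and no VLD1DUP node may be formed.
    uint16x8_t splat_widened(const uint8_t *p) {
      return vdupq_n_u16(*p); // zext i8 load feeding a v8i16 VDUP
    }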

Differential Revision: https://reviews.llvm.org/D27694

llvm-svn: 289972
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 3bc3cac..c3e9591 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -222,10 +222,11 @@
                        const uint16_t *QOpcodes);
 
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
-  /// should be 2, 3 or 4.  The opcode array specifies the instructions used
+  /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
   /// for loading D registers.  (Q registers are not supported.)
   void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *Opcodes);
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes = nullptr);
 
   /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
   /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
@@ -1762,6 +1763,12 @@
   case ARM::VLD1q16wb_fixed : return true;
   case ARM::VLD1q32wb_fixed : return true;
   case ARM::VLD1q64wb_fixed : return true;
+  case ARM::VLD1DUPd8wb_fixed : return true;
+  case ARM::VLD1DUPd16wb_fixed : return true;
+  case ARM::VLD1DUPd32wb_fixed : return true;
+  case ARM::VLD1DUPq8wb_fixed : return true;
+  case ARM::VLD1DUPq16wb_fixed : return true;
+  case ARM::VLD1DUPq32wb_fixed : return true;
   case ARM::VLD2d8wb_fixed : return true;
   case ARM::VLD2d16wb_fixed : return true;
   case ARM::VLD2d32wb_fixed : return true;
@@ -1816,6 +1823,12 @@
   case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register;
   case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register;
   case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register;
+  case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register;
+  case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register;
+  case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register;
+  case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register;
+  case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register;
+  case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
 
   case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
   case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
@@ -2256,8 +2269,9 @@
 }
 
 void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                                   const uint16_t *Opcodes) {
-  assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
@@ -2285,19 +2299,21 @@
   }
   Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32);
 
-  unsigned OpcodeIndex;
+  unsigned Opc;
   switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("unhandled vld-dup type");
-  case MVT::v8i8:  OpcodeIndex = 0; break;
-  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v8i8:  Opc = DOpcodes[0]; break;
+  case MVT::v16i8: Opc = QOpcodes[0]; break;
+  case MVT::v4i16: Opc = DOpcodes[1]; break;
+  case MVT::v8i16: Opc = QOpcodes[1]; break;
   case MVT::v2f32:
-  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v2i32: Opc = DOpcodes[2]; break;
+  case MVT::v4f32:
+  case MVT::v4i32: Opc = QOpcodes[2]; break;
   }
 
   SDValue Pred = getAL(CurDAG, dl);
   SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-  SDValue SuperReg;
-  unsigned Opc = Opcodes[OpcodeIndex];
   SmallVector<SDValue, 6> Ops;
   Ops.push_back(MemAddr);
   Ops.push_back(Align);
@@ -2305,6 +2321,8 @@
     // fixed-stride update instructions don't have an explicit writeback
     // operand. It's implicit in the opcode itself.
     SDValue Inc = N->getOperand(2);
+    if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
     if (!isa<ConstantSDNode>(Inc.getNode()))
       Ops.push_back(Inc);
     // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
@@ -2323,14 +2341,18 @@
   ResTys.push_back(MVT::Other);
   SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
   cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
-  SuperReg = SDValue(VLdDup, 0);
 
   // Extract the subregisters.
-  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
-  unsigned SubIdx = ARM::dsub_0;
-  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
-    ReplaceUses(SDValue(N, Vec),
-                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  if (NumVecs == 1) {
+    ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0));
+  } else {
+    SDValue SuperReg = SDValue(VLdDup, 0);
+    static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
+    unsigned SubIdx = ARM::dsub_0;
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+      ReplaceUses(SDValue(N, Vec),
+                  CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  }
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
@@ -3402,6 +3424,15 @@
     return;
   }
 
+  case ARMISD::VLD1DUP: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16,
+                                         ARM::VLD1DUPd32 };
+    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16,
+                                         ARM::VLD1DUPq32 };
+    SelectVLDDup(N, false, 1, DOpcodes, QOpcodes);
+    return;
+  }
+
   case ARMISD::VLD2DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                         ARM::VLD2DUPd32 };
@@ -3425,6 +3456,17 @@
     return;
   }
 
+  case ARMISD::VLD1DUP_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed,
+                                         ARM::VLD1DUPd16wb_fixed,
+                                         ARM::VLD1DUPd32wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed,
+                                         ARM::VLD1DUPq16wb_fixed,
+                                         ARM::VLD1DUPq32wb_fixed };
+    SelectVLDDup(N, true, 1, DOpcodes, QOpcodes);
+    return;
+  }
+
   case ARMISD::VLD2DUP_UPD: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                         ARM::VLD2DUPd16wb_fixed,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 30a27f3..2c88c75 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1428,6 +1428,7 @@
   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
   case ARMISD::VBSL:          return "ARMISD::VBSL";
   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
+  case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
@@ -1438,6 +1439,7 @@
   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
@@ -10473,6 +10475,7 @@
       isLaneOp = true;
       switch (N->getOpcode()) {
       default: llvm_unreachable("unexpected opcode for Neon base update");
+      case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
@@ -10587,8 +10590,8 @@
       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
     }
 
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
-                                           Ops, AlignedVecTy,
+    EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                            MemN->getMemOperand());
 
     // Update the uses.
@@ -10733,6 +10736,31 @@
   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 }
 
+/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
+static SDValue PerformVDUPCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+
+  // Match VDUP(LOAD) -> VLD1DUP.
+  // We match this pattern here rather than waiting for isel because the
+  // transform is only legal for unindexed loads.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
+  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
+      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
+    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
+                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
+    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
+    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
+                                             Ops, LD->getMemoryVT(),
+                                             LD->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
+    return VLDDup;
+  }
+
+  return SDValue();
+}
+
 static SDValue PerformLOADCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
@@ -11560,6 +11588,7 @@
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+  case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@@ -11575,6 +11604,7 @@
   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
+  case ARMISD::VLD1DUP:
   case ARMISD::VLD2DUP:
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 7583b53..6348edb 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -190,7 +190,8 @@
       MEMCPY,
 
       // Vector load N-element structure to all lanes:
-      VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD2DUP,
       VLD3DUP,
       VLD4DUP,
 
@@ -202,6 +203,7 @@
       VLD2LN_UPD,
       VLD3LN_UPD,
       VLD4LN_UPD,
+      VLD1DUP_UPD,
       VLD2DUP_UPD,
       VLD3DUP_UPD,
       VLD4DUP_UPD,