[ARM] Add ARMISD::VLD1DUP to match vld1_dup more consistently.
Currently, there are substantial problems forming vld1_dup even if the
VDUP survives legalization. The lack of an actual node
leads to terrible results: not only can we not form post-increment vld1_dup
instructions, but we form scalar pre-increment and post-increment
loads which force the loaded value into a GPR. This patch fixes that
by combining the vdup+load into an ARMISD node before DAGCombine
messes it up.
Also includes a crash fix for vld2_dup (see testcase @vld2dupi8_postinc_variable).
Recommiting with fix to avoid forming vld1dup if the type of the load
doesn't match the type of the vdup (see
https://llvm.org/bugs/show_bug.cgi?id=31404).
Differential Revision: https://reviews.llvm.org/D27694
llvm-svn: 289972
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 30a27f3..2c88c75 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1428,6 +1428,7 @@
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
case ARMISD::VBSL: return "ARMISD::VBSL";
case ARMISD::MEMCPY: return "ARMISD::MEMCPY";
+ case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP";
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
@@ -1438,6 +1439,7 @@
case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
+ case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD";
case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
@@ -10473,6 +10475,7 @@
isLaneOp = true;
switch (N->getOpcode()) {
default: llvm_unreachable("unexpected opcode for Neon base update");
+ case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
@@ -10587,8 +10590,8 @@
StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
}
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys,
- Ops, AlignedVecTy,
+ EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
MemN->getMemOperand());
// Update the uses.
@@ -10733,6 +10736,31 @@
return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}
+/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
+static SDValue PerformVDUPCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Op = N->getOperand(0);
+
+ // Match VDUP(LOAD) -> VLD1DUP.
+ // We match this pattern here rather than waiting for isel because the
+ // transform is only legal for unindexed loads.
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
+ if (LD && Op.hasOneUse() && LD->isUnindexed() &&
+ LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
+ SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
+ DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
+ SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
+ SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
+ Ops, LD->getMemoryVT(),
+ LD->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
+ return VLDDup;
+ }
+
+ return SDValue();
+}
+
static SDValue PerformLOADCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
@@ -11560,6 +11588,7 @@
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+ case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
@@ -11575,6 +11604,7 @@
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
+ case ARMISD::VLD1DUP:
case ARMISD::VLD2DUP:
case ARMISD::VLD3DUP:
case ARMISD::VLD4DUP: