[Power9] Exploit move and splat instructions for build_vector improvement
This patch corresponds to review:
https://reviews.llvm.org/D21135
This patch exploits the following instructions:
mtvsrws
lxvwsx
mtvsrdd
mfvsrld
These instructions are used to improve some build_vector and extractelement patterns.
llvm-svn: 282246
diff --git a/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 8190f31..9513fd3 100644
--- a/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -328,10 +328,12 @@
O << (unsigned int)Value;
}
+// Operands of BUILD_VECTOR are signed and we use this to print operands
+// of XXSPLTIB which are unsigned. So we simply truncate to 8 bits and
+// print as unsigned.
void PPCInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- unsigned int Value = MI->getOperand(OpNo).getImm();
- assert(Value <= 255 && "Invalid u8imm argument!");
+ unsigned char Value = MI->getOperand(OpNo).getImm();
O << (unsigned int)Value;
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index c414a15..5bce336 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -672,6 +672,9 @@
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
}
+
+ if (Subtarget.isISA3_0() && Subtarget.hasDirectMove())
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Legal);
}
if (Subtarget.hasQPX()) {
@@ -7079,6 +7082,16 @@
return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
+static bool isNonConstSplatBV(BuildVectorSDNode *BVN, EVT Type) {
+  if (BVN->getValueType(0) != Type) // wrong vector type: not the splat we want
+    return false;
+  const SDValue First = BVN->getOperand(0);
+  unsigned Idx = 1, NumOps = BVN->getNumOperands();
+  while (Idx < NumOps && BVN->getOperand(Idx) == First) // skip matching operands
+    ++Idx;
+  return Idx >= NumOps; // true iff every operand is identical to operand 0
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -7200,8 +7213,17 @@
bool HasAnyUndefs;
if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
- SplatBitSize > 32)
+ SplatBitSize > 32) {
+ // We can splat a non-const value on CPUs that implement ISA 3.0
+ // in two ways: LXVWSX (load and splat) and MTVSRWS (move and splat).
+ auto OpZero = BVN->getOperand(0);
+ bool CanLoadAndSplat = OpZero.getOpcode() == ISD::LOAD &&
+ BVN->isOnlyUserOf(OpZero.getNode());
+ if (Subtarget.isISA3_0() &&
+ isNonConstSplatBV(BVN, MVT::v4i32) && !CanLoadAndSplat)
+ return Op;
return SDValue();
+ }
unsigned SplatBits = APSplatBits.getZExtValue();
unsigned SplatUndef = APSplatUndef.getZExtValue();
@@ -7219,6 +7241,10 @@
return Op;
}
+ // We have XXSPLTIB for constant splats one byte wide
+ if (Subtarget.isISA3_0() && Op.getValueType() == MVT::v16i8)
+ return Op;
+
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
(32-SplatBitSize));
@@ -7462,6 +7488,18 @@
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
+
+ // If the source for the shuffle is a scalar_to_vector that came from a
+ // 32-bit load, it will have used LXVWSX so we don't need to splat again.
+ if (Subtarget.isISA3_0() &&
+ ((isLittleEndian && SplatIdx == 3) ||
+ (!isLittleEndian && SplatIdx == 0))) {
+ SDValue Src = V1.getOperand(0);
+ if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ Src.getOperand(0).getOpcode() == ISD::LOAD &&
+ Src.getOperand(0).hasOneUse())
+ return V1;
+ }
SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
DAG.getConstant(SplatIdx, dl, MVT::i32));
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 163c5d0..e67dfe2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -1059,6 +1059,13 @@
let Inst{31} = XT{5};
}
+class XX3Form_Zero<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let XA = XT;
+ let XB = XT;
+}
+
class XX3Form_1<bits<6> opcode, bits<8> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 7939bc8..51e1a7e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -312,6 +312,7 @@
// field. Used by instructions like 'ori'.
return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
}], LO16>;
+def immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
// imm16Shifted* - These match immediates where the low 16-bits are zero. There
// are two forms: imm16ShiftedSExt and imm16ShiftedZExt. These two forms are
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 7d1cf8e..f461b1a6 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -767,6 +767,10 @@
"xxlxor $XT, $XA, $XB", IIC_VecGeneral,
[(set v4i32:$XT, (xor v4i32:$XA, v4i32:$XB))]>;
} // isCommutable
+ let isCodeGenOnly = 1 in
+ def XXLXORz : XX3Form_Zero<60, 154, (outs vsrc:$XT), (ins),
+ "xxlxor $XT, $XT, $XT", IIC_VecGeneral,
+ [(set v4i32:$XT, (v4i32 immAllZerosV))]>;
// Permutation Instructions
def XXMRGHW : XX3Form<60, 18,
@@ -1315,8 +1319,7 @@
let Predicates = [IsISA3_0, HasDirectMove] in {
def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
- "mtvsrws $XT, $rA", IIC_VecGeneral,
- []>;
+ "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
"mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
@@ -1880,6 +1883,10 @@
dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
}
+// Materialize a zero-vector of long long
+def : Pat<(v2i64 immAllZerosV),
+ (v2i64 (XXLXORz))>;
+
// The following VSX instructions were introduced in Power ISA 3.0
def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
let AddedComplexity = 400, Predicates = [HasP9Vector] in {
@@ -2310,4 +2317,40 @@
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
+
+ def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
+ (v4i32 (LXVWSX xoaddr:$src))>;
+ def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
+ (v4f32 (LXVWSX xoaddr:$src))>;
+ def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (v4i32 (MTVSRWS $A))>;
+ def : Pat<(v16i8 (build_vector immSExt8:$A, immSExt8:$A, immSExt8:$A,
+ immSExt8:$A, immSExt8:$A, immSExt8:$A,
+ immSExt8:$A, immSExt8:$A, immSExt8:$A,
+ immSExt8:$A, immSExt8:$A, immSExt8:$A,
+ immSExt8:$A, immSExt8:$A, immSExt8:$A,
+ immSExt8:$A)),
+ (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
+ def : Pat<(v16i8 immAllOnesV),
+ (v16i8 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+ def : Pat<(v8i16 immAllOnesV),
+ (v8i16 (COPY_TO_REGCLASS (XXSPLTIB 255), VSRC))>;
+ def : Pat<(v4i32 immAllOnesV),
+ (v4i32 (XXSPLTIB 255))>;
+ def : Pat<(v2i64 immAllOnesV),
+ (v2i64 (XXSPLTIB 255))>;
} // end HasP9Vector, AddedComplexity
+
+let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
+def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
+ (v2i64 (MTVSRDD $rB, $rA))>;
+def : Pat<(i64 (extractelt v2i64:$A, 0)),
+ (i64 (MFVSRLD $A))>;
+}
+
+let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
+def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
+ (v2i64 (MTVSRDD $rB, $rA))>;
+def : Pat<(i64 (extractelt v2i64:$A, 1)),
+ (i64 (MFVSRLD $A))>;
+}