[Power9] Add exploitation of non-permuting memory ops
This patch corresponds to review:
https://reviews.llvm.org/D19825
The new lxvx/stxvx instructions do not require swaps to put the elements in
the correct order. To ensure they are selected in preference to the
lxvd2x/lxvw4x instructions (which do require the swaps), the patterns for the
older instructions are guarded by a predicate that prevents their selection
on Power9 and newer CPUs.
llvm-svn: 282143
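
As a rough illustration (the function below is illustrative, not part of the
patch): on little-endian targets, VSX vector loads and stores lower to
lxvd2x/stxvd2x plus an xxswapd to restore element order; an existing
swap-removal pass can often delete the swaps after the fact, but with this
patch Power9 never generates them, selecting lxvx/stxvx directly.

    // Minimal sketch, assuming the GCC/Clang vector extension and a
    // powerpc64le target: a 16-byte VSX vector copy. Pre-Power9 lowering
    // uses the permuting lxvd2x/stxvd2x (with xxswapd fix-ups, subject to
    // later swap removal); -mcpu=pwr9 selects lxvx/stxvx directly.
    typedef double v2f64 __attribute__((vector_size(16)));

    void copy(v2f64 *dst, const v2f64 *src) {
      *dst = *src; // one vector load, one vector store
    }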
diff --git a/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 61ad570..8190f31 100644
--- a/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -33,6 +33,11 @@
FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
cl::desc("Use full register names when printing assembly"));
+// Useful for testing purposes. Prints vs{32-63} as v{0-31} respectively.
+static cl::opt<bool>
+ShowVSRNumsAsVR("ppc-vsr-nums-as-vr", cl::Hidden, cl::init(false),
+                cl::desc("Prints full register names with vs{32-63} as v{0-31}"));
+
#define PRINT_ALIAS_INSTR
#include "PPCGenAsmWriter.inc"
@@ -462,6 +467,14 @@
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
const char *RegName = getRegisterName(Op.getReg());
+ if (ShowVSRNumsAsVR) {
+ unsigned RegNum = Op.getReg();
+ if (RegNum >= PPC::VSH0 && RegNum <= PPC::VSH31)
+ O << 'v' << RegNum - PPC::VSH0;
+ else
+ O << RegName;
+ return;
+ }
// The Linux and AIX assemblers do not take register prefixes.
if (!isDarwinSyntax())
RegName = stripRegisterPrefix(RegName);
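
A note on the new option: vs32-vs63 are the VSX aliases of the vector
registers v0-v31, so with -ppc-vsr-nums-as-vr set, vs34 prints as v2, vs63 as
v31, and so on; registers outside that range fall through and print under
their usual full names. As the comment says, this is a testing aid, not a
change to accepted assembler syntax.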
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cd75474..1d9181b 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10734,10 +10734,11 @@
}
// For little endian, VSX stores require generating xxswapd/stxvd2x.
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
EVT VT = N->getOperand(1).getValueType();
if (VT.isSimple()) {
MVT StoreVT = VT.getSimpleVT();
- if (Subtarget.hasVSX() && Subtarget.isLittleEndian() &&
+ if (Subtarget.needsSwapsForVSXMemOps() &&
(StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
return expandVSXStoreForLE(N, DCI);
@@ -10749,9 +10750,10 @@
EVT VT = LD->getValueType(0);
// For little endian, VSX loads require generating lxvd2x/xxswapd.
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (VT.isSimple()) {
MVT LoadVT = VT.getSimpleVT();
- if (Subtarget.hasVSX() && Subtarget.isLittleEndian() &&
+ if (Subtarget.needsSwapsForVSXMemOps() &&
(LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
return expandVSXLoadForLE(N, DCI);
@@ -11066,7 +11068,8 @@
break;
case ISD::INTRINSIC_W_CHAIN: {
// For little endian, VSX loads require generating lxvd2x/xxswapd.
- if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) {
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
+ if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default:
break;
@@ -11079,7 +11082,8 @@
}
case ISD::INTRINSIC_VOID: {
// For little endian, VSX stores require generating xxswapd/stxvd2x.
- if (Subtarget.hasVSX() && Subtarget.isLittleEndian()) {
+ // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
+ if (Subtarget.needsSwapsForVSXMemOps()) {
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
default:
break;
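
All four combine sites above (the load/store expansions and the two intrinsic
cases) now funnel through a single subtarget query,
needsSwapsForVSXMemOps(), instead of each repeating
hasVSX() && isLittleEndian(); the helper itself is added to PPCSubtarget.h at
the end of this patch.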
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 8cbd71e..88915a5 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -273,6 +273,7 @@
case PPC::RESTORE_CRBIT:
case PPC::LVX:
case PPC::LXVD2X:
+ case PPC::LXVX:
case PPC::QVLFDX:
case PPC::QVLFSXs:
case PPC::QVLFDXb:
@@ -302,6 +303,7 @@
case PPC::SPILL_CRBIT:
case PPC::STVX:
case PPC::STXVD2X:
+ case PPC::STXVX:
case PPC::QVSTFDX:
case PPC::QVSTFSXs:
case PPC::QVSTFDXb:
@@ -1008,7 +1010,8 @@
FrameIdx));
NonRI = true;
} else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STXVD2X))
+ unsigned Op = Subtarget.hasP9Vector() ? PPC::STXVX : PPC::STXVD2X;
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
@@ -1129,7 +1132,8 @@
FrameIdx));
NonRI = true;
} else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LXVD2X), DestReg),
+ unsigned Op = Subtarget.hasP9Vector() ? PPC::LXVX : PPC::LXVD2X;
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Op), DestReg),
FrameIdx));
NonRI = true;
} else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
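
For spills and reloads either opcode pair is functionally correct, since a
slot written by stxvd2x is always read back by the matching lxvd2x and the
doubleword swap cancels out; selecting stxvx/lxvx on Power9 simply keeps
spill code consistent with the other non-permuting memory ops and leaves the
stack-slot image in natural element order.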
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index e229abd..111d8fc 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -92,6 +92,7 @@
def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
+def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">;
let Predicates = [HasVSX] in {
let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
@@ -105,6 +106,7 @@
"lxsdx $XT, $src", IIC_LdStLFD,
[(set f64:$XT, (load xoaddr:$src))]>;
+ let Predicates = [HasVSX, HasOnlySwappingMemOps] in
def LXVD2X : XX1Form<31, 844,
(outs vsrc:$XT), (ins memrr:$src),
"lxvd2x $XT, $src", IIC_LdStLFD,
@@ -114,6 +116,7 @@
(outs vsrc:$XT), (ins memrr:$src),
"lxvdsx $XT, $src", IIC_LdStLFD, []>;
+ let Predicates = [HasVSX, HasOnlySwappingMemOps] in
def LXVW4X : XX1Form<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
@@ -127,6 +130,7 @@
"stxsdx $XT, $dst", IIC_LdStSTFD,
[(store f64:$XT, xoaddr:$dst)]>;
+ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
def STXVD2X : XX1Form<31, 972,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvd2x $XT, $dst", IIC_LdStSTFD,
@@ -136,7 +140,7 @@
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
[(store v4i32:$XT, xoaddr:$dst)]>;
-
+ }
} // mayStore
// Add/Mul Instructions
@@ -948,18 +952,20 @@
(v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
// Loads.
-def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
-def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
+ def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-// Stores.
-def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
- (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
-def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ // Stores.
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+ (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVW4X $rS, xoaddr:$dst)>;
+ def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+}
// Permutes.
def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
@@ -2185,7 +2191,8 @@
def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
// Load Vector Indexed
- def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, []>;
+ def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
+ [(set v2f64:$XT, (load xoaddr:$src))]>;
// Load Vector (Left-justified) with Length
def LXVL : X_XT6_RA5_RB5<31, 269, "lxvl" , vsrc, []>;
@@ -2221,7 +2228,8 @@
def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
// Store Vector Indexed
- def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, []>;
+ def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
+ [(store v2f64:$XT, xoaddr:$dst)]>;
// Store Vector (Left-justified) with Length
def STXVL : X_XS6_RA5_RB5<31, 397, "stxvl" , vsrc, []>;
@@ -2282,4 +2290,19 @@
def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
} // IsLittleEndian, HasP9Vector
+
+ def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
} // end HasP9Vector, AddedComplexity
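
Note that only the v2f64 patterns are attached to the LXVX/STXVX definitions
themselves; the anonymous patterns just above cover the remaining vector
types (v2i64, v4f32, v4i32) and re-point the load/store intrinsics at the new
instructions, taking over from the patterns that the HasOnlySwappingMemOps
predicate withdraws from Power9 selection.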
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index f58c7c1..d80a9ad 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -277,6 +277,9 @@
bool hasFloat128() const { return HasFloat128; }
bool isISA3_0() const { return IsISA3_0; }
bool useLongCalls() const { return UseLongCalls; }
+ bool needsSwapsForVSXMemOps() const {
+ return hasVSX() && isLittleEndian() && !hasP9Vector();
+ }
POPCNTDKind hasPOPCNTD() const { return HasPOPCNTD; }
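
The helper is gated on hasP9Vector() rather than isISA3_0(), matching the
guards used throughout this patch: lxvx/stxvx are defined under the P9Vector
feature, so that is the query that guarantees they can be selected. A
standalone restatement of its truth table (plain C++ sketch; parameter names
invented here, the real helper lives on PPCSubtarget):

    bool needsSwapsForVSXMemOps(bool hasVSX, bool isLittleEndian,
                                bool hasP9Vector) {
      // Swaps are needed only when VSX is available, the target is
      // little-endian, and the Power9 non-permuting forms are not.
      return hasVSX && isLittleEndian && !hasP9Vector;
    }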