AMDGPU: Separate R600 and GCN TableGen files
Summary:
We now have two sets of generated TableGen files, one for R600 and one
for GCN, so each sub-target now has its own tables of instructions,
registers, ISel patterns, etc. This should help reduce compile time,
since each sub-target now only has to process the information specific
to itself. It should also keep the R600 sub-target from slowing down
the development of new GCN features, such as disassembler support,
GlobalISel, etc.
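For example, both sub-targets previously drew on a single generated
instruction table; each now includes its own (an illustrative pair, not
the full list of generated files):

  SIInstrInfo.cpp:   #include "AMDGPUGenInstrInfo.inc"  // GCN
  R600InstrInfo.cpp: #include "R600GenInstrInfo.inc"    // R600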
Reviewers: arsenm, nhaehnle, jvesely
Reviewed By: arsenm
Subscribers: MatzeB, kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, t-tye, javed.absar, llvm-commits
Differential Revision: https://reviews.llvm.org/D46365
llvm-svn: 335942
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 92d7198..997b41e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -14,6 +14,7 @@
#include "SIInstrInfo.h"
#include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "GCNHazardRecognizer.h"
#include "SIDefines.h"
@@ -63,6 +64,21 @@
using namespace llvm;
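+// Pull in the TableGen'erated constructor/destructor for the generated
+// AMDGPUGenInstrInfo base class, plus the searchable intrinsic tables.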
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+namespace AMDGPU {
+#define GET_D16ImageDimIntrinsics_IMPL
+#define GET_ImageDimIntrinsicTable_IMPL
+#define GET_RsrcIntrinsics_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+}
+}
+
+
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
@@ -71,7 +87,10 @@
cl::desc("Restrict range of branch instructions (DEBUG)"));
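+// The two opcodes passed to the generated base class are the call-frame
+// setup/destroy pseudos, which TargetInstrInfo uses to recognize call frames.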
SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
- : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
+ : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+ RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -438,6 +457,28 @@
return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
}
+// FIXME: This behaves strangely. With, say, 32 loads and 32 stores, the first
+// 16 loads are interleaved with the stores, and the next 16 are clustered as
+// expected; it should really split them into two batches of 16 instead.
+//
+// Loads are clustered until this returns false, rather than trying to
+// schedule groups of stores. This also means having to decide whether loads
+// from different address spaces should be clustered, and whether loads that
+// might cause bank conflicts should be.
+//
+// This might be deprecated, so it might not be worth that much effort to fix.
+bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+ int64_t Offset0, int64_t Offset1,
+ unsigned NumLoads) const {
+ assert(Offset1 > Offset0 &&
+ "Second offset should be larger than first offset!");
+  // If we have at most 16 loads in a row, and the offsets are within 64
+  // bytes, then schedule together.
+
+ // A cacheline is 64 bytes (for global memory).
+ return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
@@ -998,7 +1039,7 @@
unsigned FrameOffset, unsigned Size) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+ const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1134,7 +1175,7 @@
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
- default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+ default: return TargetInstrInfo::expandPostRAPseudo(MI);
case AMDGPU::S_MOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
@@ -1900,16 +1941,16 @@
switch(Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
- return AMDGPUASI.PRIVATE_ADDRESS;
+ return ST.getAMDGPUAS().PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
case PseudoSourceValue::TargetCustom:
- return AMDGPUASI.CONSTANT_ADDRESS;
+ return ST.getAMDGPUAS().CONSTANT_ADDRESS;
}
- return AMDGPUASI.FLAT_ADDRESS;
+ return ST.getAMDGPUAS().FLAT_ADDRESS;
}
static void removeModOperands(MachineInstr &MI) {
@@ -4649,7 +4690,7 @@
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4768,7 +4809,7 @@
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
return true;
}
return false;
@@ -4948,3 +4989,65 @@
const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
return RCID == AMDGPU::SReg_128RegClassID;
}
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+ SI = 0,
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3,
+ GFX80 = 4,
+ GFX9 = 5
+};
+
+static SIEncodingFamily subtargetEncodingFamily(const SISubtarget &ST) {
+ switch (ST.getGeneration()) {
+ case SISubtarget::SOUTHERN_ISLANDS:
+ case SISubtarget::SEA_ISLANDS:
+ return SIEncodingFamily::SI;
+ case SISubtarget::VOLCANIC_ISLANDS:
+ case SISubtarget::GFX9:
+ return SIEncodingFamily::VI;
+ }
+ llvm_unreachable("Unknown subtarget generation!");
+}
+
+int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+ if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+ ST.getGeneration() >= SISubtarget::GFX9)
+ Gen = SIEncodingFamily::GFX9;
+
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == SISubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+ // Adjust the encoding family to GFX80 for D16 buffer instructions when the
+ // subtarget has UnpackedD16VMem feature.
+ // TODO: remove this when we discard GFX80 encoding.
+ if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
+ Gen = SIEncodingFamily::GFX80;
+
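+  // getMCOpcode indexes the TableGen'erated pseudo-to-real opcode table with
+  // the encoding family chosen above.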
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+
+ // -1 means that Opcode is already a native instruction.
+ if (MCOp == -1)
+ return Opcode;
+
+ // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+ // no encoding in the given subtarget generation.
+ if (MCOp == (uint16_t)-1)
+ return -1;
+
+ return MCOp;
+}
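+
+// A sketch of the expected call site during MC lowering (the real caller
+// lives elsewhere; shown only to document the return-value contract):
+//   int MCOp = TII.pseudoToMCOpcode(MI.getOpcode());
+//   if (MCOp == -1)
+//     ... diagnose: no encoding for this pseudo on this subtarget ...
+//   else
+//     OutMI.setOpcode(MCOp);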