[AMDGPU] Come back patch for the 'Assign register class for cross block values according to the divergence.'
Detailed description:
After https://reviews.llvm.org/D59990 submit several issues were discovered.
Changes in common code were preserved but AMDGPU specific part was reverted to keep the backend working correctly.
Discovered issues were addressed in the following commits:
https://reviews.llvm.org/D67662
https://reviews.llvm.org/D67101
https://reviews.llvm.org/D63953
https://reviews.llvm.org/D63731
This change brings back AMDGPU specific changes.
Reviewed by: rampitec, arsenm
Differential Revision: https://reviews.llvm.org/D68635
llvm-svn: 374767
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 93df7f3..02a0518 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -113,10 +113,16 @@
public:
static char ID;
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+
SIFixSGPRCopies() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ void processPHINode(MachineInstr &MI);
+
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -313,52 +319,6 @@
return true;
}
-static bool phiHasVGPROperands(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- const SIRegisterInfo *TRI,
- const SIInstrInfo *TII) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- Register Reg = PHI.getOperand(i).getReg();
- if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
- return true;
- }
- return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- SmallSet<unsigned, 8> &Visited) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- Register Reg = PHI.getOperand(i).getReg();
- if (Visited.count(Reg))
- continue;
-
- Visited.insert(Reg);
-
- MachineInstr *DefInstr = MRI.getVRegDef(Reg);
- switch (DefInstr->getOpcode()) {
- default:
- break;
- case AMDGPU::SI_IF_BREAK:
- return true;
- case AMDGPU::PHI:
- if (phiHasBreakDef(*DefInstr, MRI, Visited))
- return true;
- }
- }
- return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
- const TargetRegisterInfo &TRI) {
- for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
- E = MBB.end(); I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
- return true;
- }
- return false;
-}
-
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
const MachineInstr *MoveImm,
const SIInstrInfo *TII,
@@ -420,12 +380,6 @@
return false;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
- return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -633,9 +587,9 @@
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
SmallVector<MachineInstr *, 16> Worklist;
@@ -657,7 +611,7 @@
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
- std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
if (!Register::isVirtualRegister(DstReg)) {
// If the destination register is a physical register there isn't
@@ -666,7 +620,7 @@
// the first lane. Insert a readfirstlane and hope for the best.
if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
Register TmpReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(MBB, MI, MI.getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
@@ -684,7 +638,7 @@
break;
}
- MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
@@ -703,70 +657,13 @@
break;
}
case AMDGPU::PHI: {
- Register Reg = MI.getOperand(0).getReg();
- if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
- break;
-
- // We don't need to fix the PHI if the common dominator of the
- // two incoming blocks terminates with a uniform branch.
- bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
- MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
- MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
- if (!predsHasDivergentTerminator(MBB0, TRI) &&
- !predsHasDivergentTerminator(MBB1, TRI)) {
- LLVM_DEBUG(dbgs()
- << "Not fixing PHI for uniform branch: " << MI << '\n');
- break;
- }
- }
-
- // If a PHI node defines an SGPR and any of its operands are VGPRs,
- // then we need to move it to the VALU.
- //
- // Also, if a PHI node defines an SGPR and has all SGPR operands
- // we must move it to the VALU, because the SGPR operands will
- // all end up being assigned the same register, which means
- // there is a potential for a conflict if different threads take
- // different control flow paths.
- //
- // For Example:
- //
- // sgpr0 = def;
- // ...
- // sgpr1 = def;
- // ...
- // sgpr2 = PHI sgpr0, sgpr1
- // use sgpr2;
- //
- // Will Become:
- //
- // sgpr2 = def;
- // ...
- // sgpr2 = def;
- // ...
- // use sgpr2
- //
- // The one exception to this rule is when one of the operands
- // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
- // instruction. In this case, there we know the program will
- // never enter the second block (the loop) without entering
- // the first block (where the condition is computed), so there
- // is no chance for values to be over-written.
-
- SmallSet<unsigned, 8> Visited;
- if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
- TII->moveToVALU(MI, MDT);
- }
-
+ processPHINode(MI);
break;
}
case AMDGPU::REG_SEQUENCE:
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
- foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
continue;
}
@@ -776,9 +673,9 @@
break;
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
- DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
- Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
- Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+ DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
+ Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
+ Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
@@ -792,7 +689,78 @@
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
- hoistAndMergeSGPRInits(AMDGPU::M0, MRI, TRI, *MDT, TII);
+ hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
return true;
}
+
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+ unsigned numVGPRUses = 0;
+ SetVector<const MachineInstr *> worklist;
+ worklist.insert(&MI);
+ while (!worklist.empty()) {
+ const MachineInstr *Instr = worklist.pop_back_val();
+ unsigned Reg = Instr->getOperand(0).getReg();
+ for (const auto &Use : MRI->use_operands(Reg)) {
+ const MachineInstr *UseMI = Use.getParent();
+ if (UseMI->isCopy() || UseMI->isRegSequence()) {
+ if (UseMI->isCopy() &&
+ UseMI->getOperand(0).getReg().isPhysical() &&
+ !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
+ numVGPRUses++;
+ }
+ worklist.insert(UseMI);
+ continue;
+ }
+
+ if (UseMI->isPHI()) {
+ const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
+ if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
+ UseRC != &AMDGPU::VReg_1RegClass)
+ numVGPRUses++;
+ continue;
+ }
+
+ const TargetRegisterClass *OpRC =
+ TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
+ if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+ OpRC != &AMDGPU::VS_64RegClass) {
+ numVGPRUses++;
+ }
+ }
+ }
+ bool hasVGPRInput = false;
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ unsigned InputReg = MI.getOperand(i).getReg();
+ MachineInstr *Def = MRI->getVRegDef(InputReg);
+ if (TRI->isVGPR(*MRI, InputReg)) {
+ if (Def->isCopy()) {
+ unsigned SrcReg = Def->getOperand(1).getReg();
+ const TargetRegisterClass *RC =
+ TRI->getRegClassForReg(*MRI, SrcReg);
+ if (TRI->isSGPRClass(RC))
+ continue;
+ }
+ hasVGPRInput = true;
+ break;
+ }
+ else if (Def->isCopy() &&
+ TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) {
+ hasVGPRInput = true;
+ break;
+ }
+ }
+ unsigned PHIRes = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+
+ if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+ (hasVGPRInput || numVGPRUses > 1)) {
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
+ TII->moveToVALU(MI);
+ }
+ else {
+ LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
+ TII->legalizeOperands(MI, MDT);
+ }
+
+}