[AMDGPU] Reland 'Assign register class for cross block values according to the divergence.'
Detailed description:
After https://reviews.llvm.org/D59990 was submitted, several issues were discovered.
The changes in common code were preserved, but the AMDGPU-specific part was reverted to keep the backend working correctly.
The discovered issues were addressed in the following commits:
https://reviews.llvm.org/D67662
https://reviews.llvm.org/D67101
https://reviews.llvm.org/D63953
https://reviews.llvm.org/D63731
This change brings back the AMDGPU-specific changes.
Reviewed by: rampitec, arsenm
Differential Revision: https://reviews.llvm.org/D68635
llvm-svn: 374767
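
For illustration only (not part of this patch): a minimal sketch of how a caller that creates
virtual registers for cross-block values could combine divergence information with the two
hooks added below, SITargetLowering::getRegClassFor(MVT, bool isDivergent) and
requiresUniformRegister(). The helper name createCrossBlockVReg and the explicit MF/DA
parameters are hypothetical and only illustrate the intended use of the hooks.

    #include "SIISelLowering.h"
    #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/IR/Value.h"
    using namespace llvm;

    // Hypothetical helper: a divergent value gets a VGPR class unless one of
    // its users (a control-flow intrinsic or an SGPR-constrained inline-asm
    // output) requires a uniform register; uniform values get an SGPR class.
    static unsigned createCrossBlockVReg(MachineFunction &MF,
                                         const SITargetLowering &TLI,
                                         const LegacyDivergenceAnalysis &DA,
                                         const Value *V, MVT VT) {
      bool Divergent = DA.isDivergent(V) && !TLI.requiresUniformRegister(MF, V);
      const TargetRegisterClass *RC = TLI.getRegClassFor(VT, Divergent);
      return MF.getRegInfo().createVirtualRegister(RC);
    }
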
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 93df7f3..02a0518 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -113,10 +113,16 @@
public:
static char ID;
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+
SIFixSGPRCopies() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ void processPHINode(MachineInstr &MI);
+
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -313,52 +319,6 @@
return true;
}
-static bool phiHasVGPROperands(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- const SIRegisterInfo *TRI,
- const SIInstrInfo *TII) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- Register Reg = PHI.getOperand(i).getReg();
- if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
- return true;
- }
- return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- SmallSet<unsigned, 8> &Visited) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- Register Reg = PHI.getOperand(i).getReg();
- if (Visited.count(Reg))
- continue;
-
- Visited.insert(Reg);
-
- MachineInstr *DefInstr = MRI.getVRegDef(Reg);
- switch (DefInstr->getOpcode()) {
- default:
- break;
- case AMDGPU::SI_IF_BREAK:
- return true;
- case AMDGPU::PHI:
- if (phiHasBreakDef(*DefInstr, MRI, Visited))
- return true;
- }
- }
- return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
- const TargetRegisterInfo &TRI) {
- for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
- E = MBB.end(); I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
- return true;
- }
- return false;
-}
-
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
const MachineInstr *MoveImm,
const SIInstrInfo *TII,
@@ -420,12 +380,6 @@
return false;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
- return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -633,9 +587,9 @@
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
SmallVector<MachineInstr *, 16> Worklist;
@@ -657,7 +611,7 @@
Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
- std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
if (!Register::isVirtualRegister(DstReg)) {
// If the destination register is a physical register there isn't
@@ -666,7 +620,7 @@
// the first lane. Insert a readfirstlane and hope for the best.
if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
Register TmpReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(MBB, MI, MI.getDebugLoc(),
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
@@ -684,7 +638,7 @@
break;
}
- MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
@@ -703,70 +657,13 @@
break;
}
case AMDGPU::PHI: {
- Register Reg = MI.getOperand(0).getReg();
- if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
- break;
-
- // We don't need to fix the PHI if the common dominator of the
- // two incoming blocks terminates with a uniform branch.
- bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
- MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
- MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
- if (!predsHasDivergentTerminator(MBB0, TRI) &&
- !predsHasDivergentTerminator(MBB1, TRI)) {
- LLVM_DEBUG(dbgs()
- << "Not fixing PHI for uniform branch: " << MI << '\n');
- break;
- }
- }
-
- // If a PHI node defines an SGPR and any of its operands are VGPRs,
- // then we need to move it to the VALU.
- //
- // Also, if a PHI node defines an SGPR and has all SGPR operands
- // we must move it to the VALU, because the SGPR operands will
- // all end up being assigned the same register, which means
- // there is a potential for a conflict if different threads take
- // different control flow paths.
- //
- // For Example:
- //
- // sgpr0 = def;
- // ...
- // sgpr1 = def;
- // ...
- // sgpr2 = PHI sgpr0, sgpr1
- // use sgpr2;
- //
- // Will Become:
- //
- // sgpr2 = def;
- // ...
- // sgpr2 = def;
- // ...
- // use sgpr2
- //
- // The one exception to this rule is when one of the operands
- // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
- // instruction. In this case, there we know the program will
- // never enter the second block (the loop) without entering
- // the first block (where the condition is computed), so there
- // is no chance for values to be over-written.
-
- SmallSet<unsigned, 8> Visited;
- if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
- TII->moveToVALU(MI, MDT);
- }
-
+ processPHINode(MI);
break;
}
case AMDGPU::REG_SEQUENCE:
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
- foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
continue;
}
@@ -776,9 +673,9 @@
break;
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
- DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
- Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
- Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+ DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
+ Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
+ Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
@@ -792,7 +689,78 @@
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
- hoistAndMergeSGPRInits(AMDGPU::M0, MRI, TRI, *MDT, TII);
+ hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
return true;
}
+
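+// Decide how to handle a cross-block PHI: count the VGPR uses of its result
+// (looking through copies and REG_SEQUENCEs) and check whether any incoming
+// value is actually produced in a VGPR. A PHI whose result is not already a
+// VGPR (and is not VReg_1) is moved to the VALU when it has a VGPR input or
+// more than one VGPR use; otherwise only its operands are legalized.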
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+ unsigned numVGPRUses = 0;
+ SetVector<const MachineInstr *> worklist;
+ worklist.insert(&MI);
+ while (!worklist.empty()) {
+ const MachineInstr *Instr = worklist.pop_back_val();
+ unsigned Reg = Instr->getOperand(0).getReg();
+ for (const auto &Use : MRI->use_operands(Reg)) {
+ const MachineInstr *UseMI = Use.getParent();
+ if (UseMI->isCopy() || UseMI->isRegSequence()) {
+ if (UseMI->isCopy() &&
+ UseMI->getOperand(0).getReg().isPhysical() &&
+ !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
+ numVGPRUses++;
+ }
+ worklist.insert(UseMI);
+ continue;
+ }
+
+ if (UseMI->isPHI()) {
+ const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
+ if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
+ UseRC != &AMDGPU::VReg_1RegClass)
+ numVGPRUses++;
+ continue;
+ }
+
+ const TargetRegisterClass *OpRC =
+ TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
+ if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+ OpRC != &AMDGPU::VS_64RegClass) {
+ numVGPRUses++;
+ }
+ }
+ }
+ bool hasVGPRInput = false;
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ unsigned InputReg = MI.getOperand(i).getReg();
+ MachineInstr *Def = MRI->getVRegDef(InputReg);
+ if (TRI->isVGPR(*MRI, InputReg)) {
+ if (Def->isCopy()) {
+ unsigned SrcReg = Def->getOperand(1).getReg();
+ const TargetRegisterClass *RC =
+ TRI->getRegClassForReg(*MRI, SrcReg);
+ if (TRI->isSGPRClass(RC))
+ continue;
+ }
+ hasVGPRInput = true;
+ break;
+ }
+ else if (Def->isCopy() &&
+ TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) {
+ hasVGPRInput = true;
+ break;
+ }
+ }
+ unsigned PHIRes = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+
+ if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+ (hasVGPRInput || numVGPRUses > 1)) {
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
+ TII->moveToVALU(MI);
+ }
+ else {
+ LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
+ TII->legalizeOperands(MI, MDT);
+ }
+
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ba637d4..1809817 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10944,3 +10944,110 @@
return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
+
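+// Return the register class to use for a value of type VT according to its
+// divergence: uniform values get an SGPR class, divergent values a VGPR
+// class. A uniform VReg_1 maps to the wave-mask class (SReg_32 or SReg_64,
+// depending on the wavefront size).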
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+ const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+ return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+ : &AMDGPU::SReg_32RegClass;
+ if (!TRI->isSGPRClass(RC) && !isDivergent)
+ return TRI->getEquivalentSGPRClass(RC);
+ else if (TRI->isSGPRClass(RC) && isDivergent)
+ return TRI->getEquivalentVGPRClass(RC);
+
+ return RC;
+}
+
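+// Check whether V, or a value computed from it, is used by one of the
+// structured control-flow intrinsics (amdgcn_if, amdgcn_else, amdgcn_if_break,
+// amdgcn_end_cf, amdgcn_loop), whose mask values must live in SGPRs.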
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+ if (!Visited.insert(V).second)
+ return false;
+ bool Result = false;
+ for (auto U : V->users()) {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+ if (V == U->getOperand(1)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_if_break:
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else:
+ Result = true;
+ break;
+ }
+ }
+ if (V == U->getOperand(0)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_end_cf:
+ case Intrinsic::amdgcn_loop:
+ Result = true;
+ break;
+ }
+ }
+ } else {
+ Result = hasCFUser(U, Visited);
+ }
+ if (Result)
+ break;
+ }
+ return Result;
+}
+
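+// Return true if V must be kept in a uniform (SGPR) register regardless of
+// what divergence analysis reports: the result of amdgcn_if_break, the mask
+// returned by amdgcn_if/amdgcn_else, inline-asm outputs constrained to SGPRs,
+// and any value that feeds the control-flow intrinsics above.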
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if_break:
+ return true;
+ }
+ }
+ if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+ if (const IntrinsicInst *Intrinsic =
+ dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else: {
+ ArrayRef<unsigned> Indices = ExtValue->getIndices();
+ if (Indices.size() == 1 && Indices[0] == 1) {
+ return true;
+ }
+ }
+ }
+ }
+ }
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (isa<InlineAsm>(CI->getCalledValue())) {
+ const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+ ImmutableCallSite CS(CI);
+ TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+ MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+ for (auto &TC : TargetConstraints) {
+ if (TC.Type == InlineAsm::isOutput) {
+ ComputeConstraintToUse(TC, SDValue());
+ unsigned AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+ SIRI, TC.ConstraintCode, TC.ConstraintVT);
+ if (RC) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+ return true;
+ else if (SIRI->isSGPRClass(RC))
+ return true;
+ }
+ }
+ }
+ }
+ }
+ SmallPtrSet<const Value *, 16> Visited;
+ return hasCFUser(V, Visited);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 8198420..f0102fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -388,6 +388,10 @@
unsigned Depth = 0) const override;
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
+ virtual const TargetRegisterClass *
+ getRegClassFor(MVT VT, bool isDivergent) const override;
+ virtual bool requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const override;
Align getPrefLoopAlignment(MachineLoop *ML) const override;
void allocateHSAUserSGPRs(CCState &CCInfo,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index db33c3e..683292f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4222,7 +4222,7 @@
return;
// Try to eliminate the copy if it is copying an immediate value.
- if (Def->isMoveImmediate())
+ if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
FoldImmediate(*Copy, *Def, OpReg, &MRI);
bool ImpDef = Def->isImplicitDef();
@@ -4480,8 +4480,12 @@
if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
- VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
- : RI.getEquivalentVGPRClass(SRC);
+ if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
+ VRC = &AMDGPU::VReg_1RegClass;
+ } else
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+ ? RI.getEquivalentAGPRClass(SRC)
+ : RI.getEquivalentVGPRClass(SRC);
}
RC = VRC;
} else {
@@ -5679,7 +5683,7 @@
if (!NewDstRC)
return nullptr;
} else {
- if (RI.hasVGPRs(NewDstRC))
+ if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
return nullptr;
NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);