AMDGPU: Write LDS objects out as global symbols in code generation

Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. Relocations against these symbols are
emitted as well, and the linker is expected to resolve them.

This is initially disabled for the HSA and PAL environments until their
linkers and runtime loaders have caught up.
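
For illustration (names and exact directive syntax are only a sketch of
what the Mesa/amdgcn-- path produces, not copied from a test), an LDS
variable such as

  @lds = addrspace(3) global [64 x i32] undef, align 4

is no longer assigned an offset by the compiler. Its address is instead
materialized through a relocation, and the variable itself is emitted as
a symbol:

  v_mov_b32_e32 v0, lds@abs32@lo  ; R_AMDGPU_ABS32_LO, resolved by the linker

  .amdgpu_lds lds, 256, 4         ; 256 bytes, align 4, placed in SHN_AMDGPU_LDS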

Some notes:

- The llvm.amdgcn.groupstaticsize intrinsic can no longer be lowered
  to a constant at compile time, which means some tests no longer
  apply.

  The current "solution" is a terrible hack, but the intrinsic isn't
  used by Mesa, so we can keep it for now.

- We no longer know the full LDS size per kernel at compile time, which
  means we can no longer diagnose at compile time when a kernel's LDS
  use exceeds the hardware limit. It would be possible to add a check
  for the size of individual variables, but ultimately the linker will
  have to perform the final check.
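
For reference, the hack mentioned above: on non-HSA/PAL targets the
intrinsic now lowers to a move of an abs32-relocated symbol named after
the intrinsic, leaving it to the linker/loader to give that symbol a
meaningful value. Roughly (register choice and spelling are
illustrative):

  s_mov_b32 s0, llvm.amdgcn.groupstaticsize@abs32@lo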

Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275

Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin

Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61494

llvm-svn: 364297
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 5578251..bad5670 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -298,10 +298,37 @@
 }
 
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+      OutContext.reportError({},
+                             Twine(GV->getName()) +
+                                 ": unsupported initializer for address space");
+      return;
+    }
 
-  // Group segment variables aren't emitted in HSA.
-  if (AMDGPU::isGroupSegment(GV))
+    // LDS variables aren't emitted in HSA or PAL yet.
+    const Triple::OSType OS = TM.getTargetTriple().getOS();
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return;
+
+    MCSymbol *GVSym = getSymbol(GV);
+
+    GVSym->redefineIfPossible();
+    if (GVSym->isDefined() || GVSym->isVariable())
+      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                         "' is already defined");
+
+    const DataLayout &DL = GV->getParent()->getDataLayout();
+    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+    unsigned Align = GV->getAlignment();
+    if (!Align)
+      Align = 4;
+
+    EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+    EmitLinkage(GV, GVSym);
+    getTargetStreamer()->emitAMDGPULDS(GVSym, Size, Align);
     return;
+  }
 
   AsmPrinter::EmitGlobalVariable(GV);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1197893..d0af336 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4357,6 +4357,7 @@
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+  NODE_NAME_CASE(LDS)
   NODE_NAME_CASE(KILL)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
@@ -4571,6 +4572,15 @@
     Known.Zero.setHighBits(16);
     break;
   }
+  case AMDGPUISD::LDS: {
+    auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
+    unsigned Align = GA->getGlobal()->getAlignment();
+
+    Known.Zero.setHighBits(16);
+    if (Align)
+      Known.Zero.setLowBits(Log2_32(Align));
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a17f5da..9723fc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -485,6 +485,7 @@
   INTERP_P1LV_F16,
   INTERP_P2_F16,
   PC_ADD_REL_OFFSET,
+  LDS,
   KILL,
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 78e6e39..74ed6f1 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -50,7 +50,7 @@
     } else if (FoldOp->isFI()) {
       FrameIndexToFold = FoldOp->getIndex();
     } else {
-      assert(FoldOp->isReg());
+      assert(FoldOp->isReg() || FoldOp->isGlobal());
       OpToFold = FoldOp;
     }
   }
@@ -67,6 +67,8 @@
     return Kind == MachineOperand::MO_Register;
   }
 
+  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
   bool isCommuted() const {
     return Commuted;
   }
@@ -230,7 +232,7 @@
     }
   }
 
-  if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) {
+  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
     MachineBasicBlock *MBB = MI->getParent();
     auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
     if (Liveness != MachineBasicBlock::LQR_Dead)
@@ -277,6 +279,12 @@
     return true;
   }
 
+  if (Fold.isGlobal()) {
+    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+                   Fold.OpToFold->getTargetFlags());
+    return true;
+  }
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -368,7 +376,7 @@
       if ((Opc == AMDGPU::V_ADD_I32_e64 ||
            Opc == AMDGPU::V_SUB_I32_e64 ||
            Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
-          (OpToFold->isImm() || OpToFold->isFI())) {
+          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
         MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
         // Verify the other operand is a VGPR, otherwise we would violate the
@@ -483,7 +491,8 @@
     return;
   }
 
-  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImmLike =
+      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
   if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
@@ -884,7 +893,7 @@
   SmallVector<FoldCandidate, 4> FoldList;
   MachineOperand &Dst = MI.getOperand(0);
 
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
   if (FoldingImm) {
     unsigned NumLiteralUses = 0;
     MachineOperand *NonInlineUse = nullptr;
@@ -1232,7 +1241,8 @@
       }
 
       MachineOperand &OpToFold = MI.getOperand(1);
-      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+      bool FoldingImm =
+          OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
       // FIXME: We could also be folding things like TargetIndexes.
       if (!FoldingImm && !OpToFold.isReg())
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 64018fd..2500050 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3588,6 +3588,8 @@
   }
 
   case AMDGPU::GET_GROUPSTATICSIZE: {
+    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
         .add(MI.getOperand(0))
@@ -4776,7 +4778,10 @@
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSD->getGlobal();
-  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+       (!GV->hasExternalLinkage() ||
+        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -4784,7 +4789,12 @@
   SDLoc DL(GSD);
   EVT PtrVT = Op.getValueType();
 
-  // FIXME: Should not make address space based decisions here.
+  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+                                            SIInstrInfo::MO_ABS32_LO);
+    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
+  }
+
   if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
   else if (shouldEmitPCReloc(GV))
@@ -5773,6 +5783,18 @@
     return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                       Op->getOperand(1), Op->getOperand(2)), 0);
 
+  case Intrinsic::amdgcn_groupstaticsize: {
+    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return Op;
+
+    const Module *M = MF.getFunction().getParent();
+    const GlobalValue *GV =
+        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+                                            SIInstrInfo::MO_ABS32_LO);
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f977928..1923ac2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2703,7 +2703,7 @@
   const MCInstrDesc &InstDesc = MI.getDesc();
   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
 
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
 
   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
     return true;
@@ -3012,7 +3012,7 @@
 
     switch (Desc.OpInfo[i].OperandType) {
     case MCOI::OPERAND_REGISTER:
-      if (MI.getOperand(i).isImm()) {
+      if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
         ErrInfo = "Illegal immediate value for operand.";
         return false;
       }
@@ -3682,7 +3682,7 @@
     return isLegalRegOperand(MRI, OpInfo, MO);
 
   // Handle non-register types that are treated like immediates.
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
   return true;
 }
 
@@ -3739,7 +3739,7 @@
   }
 
   // Handle non-register types that are treated like immediates.
-  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
 
   if (!DefinedRC) {
     // This operand expects an immediate.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 1a3e16a..cd1c7fda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -205,6 +205,10 @@
   SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
 >;
 
+def SIlds : SDNode<"AMDGPUISD::LDS",
+  SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
 def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
   SIload_d16,
   [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 630aeeb..2cb7e9f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1142,6 +1142,16 @@
   (S_MOV_B32 imm:$imm)
 >;
 
+def : GCNPat <
+  (VGPRImm<(SIlds tglobaladdr:$ga)>),
+  (V_MOV_B32_e32 $ga)
+>;
+
+def : GCNPat <
+  (SIlds tglobaladdr:$ga),
+  (S_MOV_B32 $ga)
+>;
+
 // FIXME: Workaround for ordering issue with peephole optimizer where
 // a register class copy interferes with immediate folding.  Should
 // use s_mov_b32, which can be shrunk to s_movk_i32
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index c180afd..7ee1781 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -95,6 +95,10 @@
           Src0.setSubReg(0);
           Src0.ChangeToFrameIndex(MovSrc.getIndex());
           ConstantFolded = true;
+        } else if (MovSrc.isGlobal()) {
+          Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+                          MovSrc.getTargetFlags());
+          ConstantFolded = true;
         }
 
         if (ConstantFolded) {