AMDHSA: Code object v3 updates

- Do not emit following assembler directives:
  - .hsa_code_object_version
  - .hsa_code_object_isa
  - .amd_amdgpu_isa
  - .amd_amdgpu_hsa_metadata
  - .amd_amdgpu_pal_metadata
- Do not emit .note entries
- Cleanup and bring in sync kernel descriptor header file
- Emit kernel descriptor into .rodata with appropriate relocations and
  alignments

llvm-svn: 334519
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 0cb9cc0..8e56ec6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -116,6 +116,10 @@
 }
 
 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+      TM.getTargetTriple().getOS() == Triple::AMDHSA)
+    return;
+
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
       TM.getTargetTriple().getOS() != Triple::AMDPAL)
     return;
@@ -126,10 +130,6 @@
   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
     readPALMetadata(M);
 
-  // Deprecated notes are not emitted for code object v3.
-  if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits()))
-    return;
-
   // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
     getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
@@ -141,6 +141,10 @@
 }
 
 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  // TODO: Add metadata to code object v3.
+  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+      TM.getTargetTriple().getOS() == Triple::AMDHSA)
+    return;
 
   // Following code requires TargetStreamer to be present.
   if (!getTargetStreamer())
@@ -186,8 +190,11 @@
 }
 
 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
-  const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
-  if (!MFI->isEntryFunction())
+  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+  if (!MFI.isEntryFunction())
+    return;
+  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+      TM.getTargetTriple().getOS() == Triple::AMDHSA)
     return;
 
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
@@ -205,7 +212,27 @@
                                getHSADebugProps(*MF, CurrentProgramInfo));
 }
 
+void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
+  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+  if (!MFI.isEntryFunction())
+    return;
+  if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
+      TM.getTargetTriple().getOS() != Triple::AMDHSA)
+    return;
+
+  SmallString<128> KernelName;
+  getNameWithPrefix(KernelName, &MF->getFunction());
+  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
+      KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo));
+}
+
 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
+      TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+    AsmPrinter::EmitFunctionEntryLabel();
+    return;
+  }
+
   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
   if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
@@ -288,6 +315,70 @@
                               false);
 }
 
+uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
+    const MachineFunction &MF) const {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  uint16_t KernelCodeProperties = 0;
+
+  if (MFI.hasPrivateSegmentBuffer()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+  }
+  if (MFI.hasDispatchPtr()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+  }
+  if (MFI.hasQueuePtr()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+  }
+  if (MFI.hasKernargSegmentPtr()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+  }
+  if (MFI.hasDispatchID()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+  }
+  if (MFI.hasFlatScratchInit()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
+  }
+  if (MFI.hasGridWorkgroupCountX()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
+  }
+  if (MFI.hasGridWorkgroupCountY()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
+  }
+  if (MFI.hasGridWorkgroupCountZ()) {
+    KernelCodeProperties |=
+        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
+  }
+
+  return KernelCodeProperties;
+}
+
+amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
+    const MachineFunction &MF,
+    const SIProgramInfo &PI) const {
+  amdhsa::kernel_descriptor_t KernelDescriptor;
+  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
+
+  assert(isUInt<32>(PI.ScratchSize));
+  assert(isUInt<32>(PI.ComputePGMRSrc1));
+  assert(isUInt<32>(PI.ComputePGMRSrc2));
+
+  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
+  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
+  KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
+  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
+
+  return KernelDescriptor;
+}
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   CurrentProgramInfo = SIProgramInfo();
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index bc2cb4b..e1a9559 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -20,6 +20,7 @@
 #include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -148,6 +149,13 @@
                                   uint64_t CodeSize,
                                   const AMDGPUMachineFunction* MFI);
 
+  uint16_t getAmdhsaKernelCodeProperties(
+      const MachineFunction &MF) const;
+
+  amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor(
+      const MachineFunction &MF,
+      const SIProgramInfo &PI) const;
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
@@ -180,6 +188,8 @@
 
   void EmitFunctionBodyStart() override;
 
+  void EmitFunctionBodyEnd() override;
+
   void EmitFunctionEntryLabel() override;
 
   void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 2455a93..6a4b205 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -196,6 +196,12 @@
   return true;
 }
 
+void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
+    StringRef KernelName,
+    const amdhsa::kernel_descriptor_t &KernelDescriptor) {
+  // FIXME: not supported yet.
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetELFStreamer
 //===----------------------------------------------------------------------===//
@@ -362,3 +368,57 @@
   );
   return true;
 }
+
+void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
+    StringRef KernelName,
+    const amdhsa::kernel_descriptor_t &KernelDescriptor) {
+  auto &Streamer = getStreamer();
+  auto &Context = Streamer.getContext();
+  auto &ObjectFileInfo = *Context.getObjectFileInfo();
+  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
+
+  Streamer.PushSection();
+  Streamer.SwitchSection(&ReadOnlySection);
+
+  // CP microcode requires the kernel descriptor to be allocated on 64 byte
+  // alignment.
+  Streamer.EmitValueToAlignment(64, 0, 1, 0);
+  if (ReadOnlySection.getAlignment() < 64)
+    ReadOnlySection.setAlignment(64);
+
+  MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>(
+      Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd")));
+  KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL);
+  KernelDescriptorSymbol->setType(ELF::STT_OBJECT);
+  KernelDescriptorSymbol->setSize(
+      MCConstantExpr::create(sizeof(KernelDescriptor), Context));
+
+  MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>(
+      Context.getOrCreateSymbol(Twine(KernelName)));
+  KernelCodeSymbol->setBinding(ELF::STB_LOCAL);
+
+  Streamer.EmitLabel(KernelDescriptorSymbol);
+  Streamer.EmitBytes(StringRef(
+      (const char*)&(KernelDescriptor),
+      offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
+  // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
+  // expression being created is:
+  //   (start of kernel code) - (start of kernel descriptor)
+  // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
+  Streamer.EmitValue(MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(
+          KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
+      MCSymbolRefExpr::create(
+          KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
+      Context),
+      sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
+  Streamer.EmitBytes(StringRef(
+      (const char*)&(KernelDescriptor) +
+          offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
+          sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
+      sizeof(KernelDescriptor) -
+          offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) -
+          sizeof(KernelDescriptor.kernel_code_entry_byte_offset)));
+
+  Streamer.PopSection();
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 74c15e4..4267b55 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -14,6 +14,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/AMDHSAKernelDescriptor.h"
 
 namespace llvm {
 #include "AMDGPUPTNote.h"
@@ -62,6 +63,10 @@
 
   /// \returns True on success, false on failure.
   virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0;
+
+  virtual void EmitAmdhsaKernelDescriptor(
+      StringRef KernelName,
+      const amdhsa::kernel_descriptor_t &KernelDescriptor) = 0;
 };
 
 class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
@@ -87,6 +92,10 @@
 
   /// \returns True on success, false on failure.
   bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+  void EmitAmdhsaKernelDescriptor(
+      StringRef KernelName,
+      const amdhsa::kernel_descriptor_t &KernelDescriptor) override;
 };
 
 class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -119,6 +128,10 @@
 
   /// \returns True on success, false on failure.
   bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override;
+
+  void EmitAmdhsaKernelDescriptor(
+      StringRef KernelName,
+      const amdhsa::kernel_descriptor_t &KernelDescriptor) override;
 };
 
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 3896bcc..da9b98b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -248,8 +248,8 @@
   Stream.flush();
 }
 
-bool hasCodeObjectV3(const FeatureBitset &Features) {
-  return Features.test(FeatureCodeObjectV3);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI) {
+  return STI->getFeatureBits().test(FeatureCodeObjectV3);
 }
 
 unsigned getWavefrontSize(const FeatureBitset &Features) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 54932d2..2abc18d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -59,9 +59,9 @@
 /// Streams isa version string for given subtarget \p STI into \p Stream.
 void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
 
-/// \returns True if given subtarget \p Features support code object version 3,
+/// \returns True if given subtarget \p STI supports code object version 3,
 /// false otherwise.
-bool hasCodeObjectV3(const FeatureBitset &Features);
+bool hasCodeObjectV3(const MCSubtargetInfo *STI);
 
 /// \returns Wavefront size for given subtarget \p Features.
 unsigned getWavefrontSize(const FeatureBitset &Features);