blob: 102a88074dfd84abb5f93f34d865617c6628260a [file] [log] [blame]
Yaxun Liude4b88d2017-10-10 19:39:48 +00001//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// \file
11// \brief This post-linking pass replaces the function pointer of enqueued
12// block kernel with a global variable (runtime handle) and adds
13// "runtime-handle" attribute to the enqueued block kernel.
14//
// In LLVM CodeGen the runtime-handle metadata will be translated to
// RuntimeHandle metadata in the code object. The runtime allocates a global
// buffer for each kernel with RuntimeHandle metadata and saves the kernel
// address required for the AQL packet into the buffer. The __enqueue_kernel
// function in the device library knows that the invoke function pointer in
// the block literal is actually a runtime handle, loads the kernel address
// from it, and puts it into the AQL packet for dispatching.
22//
23// This cannot be done in FE since FE cannot create a unique global variable
24// with external linkage across LLVM modules. The global variable with internal
25// linkage does not work since optimization passes will try to replace loads
26// of the global variable with its initialization value.
27//
// It also identifies the kernels that directly or indirectly enqueue kernels
// and adds the "calls-enqueue-kernel" function attribute to them, which will
// be used to determine whether to emit runtime metadata for the kernel
// enqueue related hidden kernel arguments.
32//
Yaxun Liude4b88d2017-10-10 19:39:48 +000033//===----------------------------------------------------------------------===//
34
35#include "AMDGPU.h"
Yaxun Liuc928f2a2017-10-30 14:30:28 +000036#include "llvm/ADT/DenseSet.h"
Yaxun Liude4b88d2017-10-10 19:39:48 +000037#include "llvm/ADT/StringRef.h"
38#include "llvm/IR/Constants.h"
Yaxun Liuc928f2a2017-10-30 14:30:28 +000039#include "llvm/IR/Instructions.h"
Yaxun Liua99e7d82018-03-12 16:34:06 +000040#include "llvm/IR/Mangler.h"
Yaxun Liude4b88d2017-10-10 19:39:48 +000041#include "llvm/IR/Module.h"
Yaxun Liuc928f2a2017-10-30 14:30:28 +000042#include "llvm/IR/User.h"
Yaxun Liude4b88d2017-10-10 19:39:48 +000043#include "llvm/Pass.h"
44#include "llvm/Support/Debug.h"
45#include "llvm/Support/raw_ostream.h"
46
47#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
48
49using namespace llvm;
50
51namespace {
52
53/// \brief Lower enqueued blocks.
54class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
55public:
56 static char ID;
57
58 explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
59
60private:
61 bool runOnModule(Module &M) override;
62};
63
64} // end anonymous namespace
65
66char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
67
68char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
69 AMDGPUOpenCLEnqueuedBlockLowering::ID;
70
71INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
72 "Lower OpenCL enqueued blocks", false, false)
73
74ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
75 return new AMDGPUOpenCLEnqueuedBlockLowering();
76}
77
Yaxun Liuc928f2a2017-10-30 14:30:28 +000078/// Collect direct or indrect callers of \p F and save them
79/// to \p Callers.
80static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
81 for (auto U : F->users()) {
82 if (auto *CI = dyn_cast<CallInst>(&*U)) {
83 auto *Caller = CI->getParent()->getParent();
84 if (Callers.count(Caller))
85 continue;
86 Callers.insert(Caller);
87 collectCallers(Caller, Callers);
88 }
89 }
90}
91
Yaxun Liude4b88d2017-10-10 19:39:48 +000092bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
Yaxun Liuc928f2a2017-10-30 14:30:28 +000093 DenseSet<Function *> Callers;
Yaxun Liude4b88d2017-10-10 19:39:48 +000094 auto &C = M.getContext();
Yaxun Liude4b88d2017-10-10 19:39:48 +000095 bool Changed = false;
96 for (auto &F : M.functions()) {
97 if (F.hasFnAttribute("enqueued-block")) {
Yaxun Liua99e7d82018-03-12 16:34:06 +000098 if (!F.hasName()) {
99 SmallString<64> Name;
100 Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
101 M.getDataLayout());
102 F.setName(Name);
103 }
104 auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
105 auto *GV = new GlobalVariable(
106 M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS),
107 /*IsConstant=*/true, GlobalValue::ExternalLinkage,
108 /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr,
109 GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS,
110 /*IsExternallyInitialized=*/true);
111 DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
112
Yaxun Liu46439e82018-03-06 16:04:39 +0000113 for (auto U : F.users()) {
114 if (!isa<ConstantExpr>(&*U))
115 continue;
116 auto *BitCast = cast<ConstantExpr>(&*U);
Yaxun Liu46439e82018-03-06 16:04:39 +0000117 auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType());
118 BitCast->replaceAllUsesWith(NewPtr);
119 F.addFnAttr("runtime-handle", RuntimeHandle);
120 F.setLinkage(GlobalValue::ExternalLinkage);
Yaxun Liuc928f2a2017-10-30 14:30:28 +0000121
Yaxun Liu46439e82018-03-06 16:04:39 +0000122 // Collect direct or indirect callers of enqueue_kernel.
123 for (auto U : NewPtr->users()) {
124 if (auto *I = dyn_cast<Instruction>(&*U)) {
125 auto *F = I->getParent()->getParent();
126 Callers.insert(F);
127 collectCallers(F, Callers);
128 }
Yaxun Liuc928f2a2017-10-30 14:30:28 +0000129 }
Yaxun Liu46439e82018-03-06 16:04:39 +0000130 Changed = true;
Yaxun Liuc928f2a2017-10-30 14:30:28 +0000131 }
Yaxun Liude4b88d2017-10-10 19:39:48 +0000132 }
133 }
Yaxun Liuc928f2a2017-10-30 14:30:28 +0000134
135 for (auto F : Callers) {
136 if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
137 continue;
138 F->addFnAttr("calls-enqueue-kernel");
139 }
Yaxun Liude4b88d2017-10-10 19:39:48 +0000140 return Changed;
141}