blob: 9de8ee888b964ef3b67a3ff35affcda2bba18bad [file] [log] [blame]
Tom Stellard880a80a2014-06-17 16:53:14 +00001//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This pass eliminates allocas by either converting them into vectors or
11// by migrating them to local address space.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUSubtarget.h"
17#include "llvm/Analysis/ValueTracking.h"
18#include "llvm/IR/IRBuilder.h"
Matt Arsenaultbafc9dc2016-03-11 08:20:50 +000019#include "llvm/IR/IntrinsicInst.h"
Matt Arsenaulte0132462016-01-30 05:19:45 +000020#include "llvm/IR/MDBuilder.h"
Tom Stellard880a80a2014-06-17 16:53:14 +000021#include "llvm/Support/Debug.h"
Benjamin Kramer16132e62015-03-23 18:07:13 +000022#include "llvm/Support/raw_ostream.h"
Tom Stellard880a80a2014-06-17 16:53:14 +000023
24#define DEBUG_TYPE "amdgpu-promote-alloca"
25
26using namespace llvm;
27
namespace {

// FIXME: This can create globals so should be a module pass.
//
// Pass that tries to eliminate entry-block allocas, either by rewriting a
// small array alloca into vector element accesses, or by replacing the
// alloca's storage with a per-workgroup LDS (local address space) global
// indexed by the flat workitem id.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
  const TargetMachine *TM;  // May be null; the pass is a no-op without it.
  Module *Mod;              // Cached module pointer, set in doInitialization.
  // !range metadata [0, 2048) attached to workitem-id / workgroup-size calls.
  MDNode *MaxWorkGroupSizeRange;

  // FIXME: This should be per-kernel.
  // Bytes of LDS still available for promotion; recomputed per function.
  int LocalMemAvailable;

  bool IsAMDGCN;  // Target arch is amdgcn (selects amdgcn_* intrinsics).
  bool IsAMDHSA;  // Target OS is AMDHSA (workgroup size read from dispatch ptr).

  // Emit IR yielding the workgroup size in the Y and Z dimensions.
  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  // Emit IR yielding the workitem id for dimension N (0, 1 or 2).
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

public:
  static char ID;

  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
    FunctionPass(ID),
    TM(TM_),
    Mod(nullptr),
    MaxWorkGroupSizeRange(nullptr),
    LocalMemAvailable(0),
    IsAMDGCN(false),
    IsAMDHSA(false) { }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU Promote Alloca";
  }

  // Attempt promotion of a single static entry-block alloca.
  void handleAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
77
Tom Stellard880a80a2014-06-17 16:53:14 +000078bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Matt Arsenaulte0132462016-01-30 05:19:45 +000079 if (!TM)
80 return false;
81
Tom Stellard880a80a2014-06-17 16:53:14 +000082 Mod = &M;
Matt Arsenaulte0132462016-01-30 05:19:45 +000083
84 // The maximum workitem id.
85 //
86 // FIXME: Should get as subtarget property. Usually runtime enforced max is
87 // 256.
88 MDBuilder MDB(Mod->getContext());
89 MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
90
91 const Triple &TT = TM->getTargetTriple();
92
93 IsAMDGCN = TT.getArch() == Triple::amdgcn;
94 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
95
Tom Stellard880a80a2014-06-17 16:53:14 +000096 return false;
97}
98
// Per-function driver: computes the LDS budget for this function, then
// attempts to promote every alloca in the entry block.
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM || F.hasFnAttribute(Attribute::OptimizeNone))
    return false;

  FunctionType *FTy = F.getFunctionType();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                      "local memory disabled.\n");
      return false;
    }
  }

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  LocalMemAvailable = ST.getLocalMemorySize();
  if (LocalMemAvailable == 0)
    return false;

  // Check how much local memory is being used by global objects
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    // NOTE(review): the local variable shadows the llvm::Use type name.
    for (Use &U : GV.uses()) {
      Instruction *Use = dyn_cast<Instruction>(U);
      if (!Use)
        continue;

      // NOTE(review): the size is subtracted once per use inside F, so a
      // global referenced several times in F is counted several times —
      // looks overly conservative; confirm whether per-global accounting
      // was intended.
      if (Use->getParent()->getParent() == &F)
        LocalMemAvailable -=
          Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  // Only the entry block is scanned; handleAlloca rejects non-static allocas
  // anyway. The iterator is advanced before handleAlloca runs because the
  // alloca may be erased during promotion.
  BasicBlock &EntryBB = *F.begin();
  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
    AllocaInst *AI = dyn_cast<AllocaInst>(I);

    ++I;
    if (AI)
      handleAlloca(*AI);
  }

  return true;
}
153
// Emit IR that yields the workgroup size in the Y and Z dimensions.
//
// On non-HSA targets this is a pair of r600.read.local.size.{y,z} intrinsic
// calls; on AMDHSA the sizes are loaded from the in-memory kernel dispatch
// packet reached through llvm.amdgcn.dispatch.ptr. Both results carry the
// MaxWorkGroupSizeRange !range metadata.
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  // typedef struct hsa_kernel_dispatch_packet_s {
  //   uint16_t header;
  //   uint16_t setup;
  //   uint16_t workgroup_size_x ;
  //   uint16_t workgroup_size_y;
  //   uint16_t workgroup_size_z;
  //   uint16_t reserved0;
  //   uint32_t grid_size_x ;
  //   uint32_t grid_size_y ;
  //   uint32_t grid_size_z;
  //
  //   uint32_t private_segment_size;
  //   uint32_t group_segment_size;
  //   uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //   void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //   void *kernarg_address;
  //   uint32_t reserved1;
  // #else
  //   uint32_t reserved1;
  //   void *kernarg_address;
  // #endif
  //   uint64_t reserved2;
  //   hsa_signal_t completion_signal; // uint64_t wrapper
  // } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergable later anyway.
  // i32 index 1 covers packet bytes 4-7: workgroup_size_x (low half) and
  // workgroup_size_y (high half).
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);

  // i32 index 2 covers packet bytes 8-11: workgroup_size_z (low half) and
  // reserved0 (high half).
  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);

  MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}
238
239Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
240 Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
241
242 switch (N) {
243 case 0:
244 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
245 : Intrinsic::r600_read_tidig_x;
246 break;
247 case 1:
248 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
249 : Intrinsic::r600_read_tidig_y;
250 break;
251
252 case 2:
253 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
254 : Intrinsic::r600_read_tidig_z;
255 break;
256 default:
257 llvm_unreachable("invalid dimension");
258 }
259
260 Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
261 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
262 CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
263
264 return CI;
265}
266
Craig Toppere3dcce92015-08-01 22:20:21 +0000267static VectorType *arrayTypeToVecType(Type *ArrayTy) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000268 return VectorType::get(ArrayTy->getArrayElementType(),
269 ArrayTy->getArrayNumElements());
270}
271
Benjamin Kramerc6cc58e2014-10-04 16:55:56 +0000272static Value *
273calculateVectorIndex(Value *Ptr,
274 const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000275 if (isa<AllocaInst>(Ptr))
276 return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
277
278 GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
279
Benjamin Kramerc6cc58e2014-10-04 16:55:56 +0000280 auto I = GEPIdx.find(GEP);
281 return I == GEPIdx.end() ? nullptr : I->second;
Tom Stellard880a80a2014-06-17 16:53:14 +0000282}
283
284static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
285 // FIXME we only support simple cases
286 if (GEP->getNumOperands() != 3)
287 return NULL;
288
289 ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
290 if (!I0 || !I0->isZero())
291 return NULL;
292
293 return GEP->getOperand(2);
294}
295
Matt Arsenault642d2e72014-06-27 16:52:49 +0000296// Not an instruction handled below to turn into a vector.
297//
298// TODO: Check isTriviallyVectorizable for calls and handle other
299// instructions.
Matt Arsenault7227cc12015-07-28 18:47:00 +0000300static bool canVectorizeInst(Instruction *Inst, User *User) {
Matt Arsenault642d2e72014-06-27 16:52:49 +0000301 switch (Inst->getOpcode()) {
302 case Instruction::Load:
Matt Arsenault642d2e72014-06-27 16:52:49 +0000303 case Instruction::BitCast:
304 case Instruction::AddrSpaceCast:
305 return true;
Matt Arsenault7227cc12015-07-28 18:47:00 +0000306 case Instruction::Store: {
307 // Must be the stored pointer operand, not a stored value.
308 StoreInst *SI = cast<StoreInst>(Inst);
309 return SI->getPointerOperand() == User;
310 }
Matt Arsenault642d2e72014-06-27 16:52:49 +0000311 default:
312 return false;
313 }
314}
315
// Try to rewrite an alloca of a small array type into vector element
// accesses: each load/store through a constant-indexed GEP becomes a
// whole-vector load/store on a bitcast of the alloca, plus an
// extractelement/insertelement for the addressed lane. Returns true if the
// alloca's users were rewritten.
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy ||
      AllocaTy->getElementType()->isVectorTy() ||
      AllocaTy->getNumElements() > 4) {
    DEBUG(dbgs() << " Cannot convert type to vector\n");
    return false;
  }

  // Lane index for each GEP user, and the list of load/store/cast
  // instructions that must be rewritten afterwards.
  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      // Direct (non-GEP) users must themselves be vectorizable.
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << " Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      // Scalar load -> whole-vector load + extractelement at the GEP's lane.
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      // Scalar store -> read-modify-write of the whole vector.
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      // Casts stay in place; their pointer operand was handled above.
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}
402
Matt Arsenaultad134842016-02-02 19:18:53 +0000403static bool isCallPromotable(CallInst *CI) {
404 // TODO: We might be able to handle some cases where the callee is a
405 // constantexpr bitcast of a function.
406 if (!CI->getCalledFunction())
407 return false;
408
409 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
410 if (!II)
411 return false;
412
413 switch (II->getIntrinsicID()) {
414 case Intrinsic::memcpy:
Matt Arsenault7e747f12016-02-02 20:28:10 +0000415 case Intrinsic::memmove:
Matt Arsenaultad134842016-02-02 19:18:53 +0000416 case Intrinsic::memset:
417 case Intrinsic::lifetime_start:
418 case Intrinsic::lifetime_end:
419 case Intrinsic::invariant_start:
420 case Intrinsic::invariant_end:
421 case Intrinsic::invariant_group_barrier:
Matt Arsenault7e747f12016-02-02 20:28:10 +0000422 case Intrinsic::objectsize:
Matt Arsenaultad134842016-02-02 19:18:53 +0000423 return true;
424 default:
425 return false;
426 }
427}
428
Tom Stellard5b2927f2014-10-31 20:52:04 +0000429static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000430 for (User *User : Val->users()) {
Matt Arsenaultad134842016-02-02 19:18:53 +0000431 if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
Tom Stellard880a80a2014-06-17 16:53:14 +0000432 continue;
Matt Arsenaultad134842016-02-02 19:18:53 +0000433
Matt Arsenaultfdcd39a2015-07-28 18:29:14 +0000434 if (CallInst *CI = dyn_cast<CallInst>(User)) {
Matt Arsenaultad134842016-02-02 19:18:53 +0000435 if (!isCallPromotable(CI))
Matt Arsenaultfdcd39a2015-07-28 18:29:14 +0000436 return false;
437
Tom Stellard880a80a2014-06-17 16:53:14 +0000438 WorkList.push_back(User);
439 continue;
440 }
Tom Stellard5b2927f2014-10-31 20:52:04 +0000441
Tom Stellard5b2927f2014-10-31 20:52:04 +0000442 Instruction *UseInst = dyn_cast<Instruction>(User);
443 if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
444 return false;
445
Matt Arsenault7227cc12015-07-28 18:47:00 +0000446 if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
447 // Reject if the stored value is not the pointer operand.
448 if (SI->getPointerOperand() != Val)
449 return false;
450 }
451
Tom Stellard880a80a2014-06-17 16:53:14 +0000452 if (!User->getType()->isPointerTy())
453 continue;
Tom Stellard5b2927f2014-10-31 20:52:04 +0000454
Matt Arsenaultde420812016-02-02 21:16:12 +0000455 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
456 // Be conservative if an address could be computed outside the bounds of
457 // the alloca.
458 if (!GEP->isInBounds())
459 return false;
460 }
461
Tom Stellard880a80a2014-06-17 16:53:14 +0000462 WorkList.push_back(User);
Matt Arsenaultad134842016-02-02 19:18:53 +0000463 if (!collectUsesWithPtrTypes(User, WorkList))
464 return false;
Tom Stellard880a80a2014-06-17 16:53:14 +0000465 }
Matt Arsenaultad134842016-02-02 19:18:53 +0000466
467 return true;
Tom Stellard880a80a2014-06-17 16:53:14 +0000468}
469
Matt Arsenaultbafc9dc2016-03-11 08:20:50 +0000470void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Matt Arsenault19c54882015-08-26 18:37:13 +0000471 if (!I.isStaticAlloca())
472 return;
473
Tom Stellard880a80a2014-06-17 16:53:14 +0000474 IRBuilder<> Builder(&I);
475
476 // First try to replace the alloca with a vector
477 Type *AllocaTy = I.getAllocatedType();
478
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000479 DEBUG(dbgs() << "Trying to promote " << I << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000480
481 if (tryPromoteAllocaToVector(&I))
482 return;
483
484 DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
485
486 // FIXME: This is the maximum work group size. We should try to get
487 // value from the reqd_work_group_size function attribute if it is
488 // available.
489 unsigned WorkGroupSize = 256;
Mehdi Amini46a43552015-03-04 18:43:29 +0000490 int AllocaSize =
491 WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
Tom Stellard880a80a2014-06-17 16:53:14 +0000492
493 if (AllocaSize > LocalMemAvailable) {
494 DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
495 return;
496 }
497
Tom Stellard5b2927f2014-10-31 20:52:04 +0000498 std::vector<Value*> WorkList;
499
500 if (!collectUsesWithPtrTypes(&I, WorkList)) {
501 DEBUG(dbgs() << " Do not know how to convert all uses\n");
502 return;
503 }
504
Tom Stellard880a80a2014-06-17 16:53:14 +0000505 DEBUG(dbgs() << "Promoting alloca to local memory\n");
506 LocalMemAvailable -= AllocaSize;
507
Matt Arsenaultcf84e262016-02-05 19:47:23 +0000508 Function *F = I.getParent()->getParent();
509
David Blaikie156d46e2015-03-24 23:34:31 +0000510 Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
Tom Stellard880a80a2014-06-17 16:53:14 +0000511 GlobalVariable *GV = new GlobalVariable(
Matt Arsenaultcf84e262016-02-05 19:47:23 +0000512 *Mod, GVTy, false, GlobalValue::InternalLinkage,
513 UndefValue::get(GVTy),
514 Twine(F->getName()) + Twine('.') + I.getName(),
515 nullptr,
516 GlobalVariable::NotThreadLocal,
517 AMDGPUAS::LOCAL_ADDRESS);
518 GV->setUnnamedAddr(true);
519 GV->setAlignment(I.getAlignment());
Tom Stellard880a80a2014-06-17 16:53:14 +0000520
Matt Arsenaulte0132462016-01-30 05:19:45 +0000521 Value *TCntY, *TCntZ;
Tom Stellard880a80a2014-06-17 16:53:14 +0000522
Matt Arsenaulte0132462016-01-30 05:19:45 +0000523 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
524 Value *TIdX = getWorkitemID(Builder, 0);
525 Value *TIdY = getWorkitemID(Builder, 1);
526 Value *TIdZ = getWorkitemID(Builder, 2);
Tom Stellard880a80a2014-06-17 16:53:14 +0000527
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000528 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
Tom Stellard880a80a2014-06-17 16:53:14 +0000529 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000530 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
Tom Stellard880a80a2014-06-17 16:53:14 +0000531 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
532 TID = Builder.CreateAdd(TID, TIdZ);
533
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000534 Value *Indices[] = {
535 Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
536 TID
537 };
Tom Stellard880a80a2014-06-17 16:53:14 +0000538
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000539 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
Tom Stellard880a80a2014-06-17 16:53:14 +0000540 I.mutateType(Offset->getType());
541 I.replaceAllUsesWith(Offset);
542 I.eraseFromParent();
543
Matt Arsenaultfb8cdba2016-02-02 19:32:35 +0000544 for (Value *V : WorkList) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000545 CallInst *Call = dyn_cast<CallInst>(V);
546 if (!Call) {
547 Type *EltTy = V->getType()->getPointerElementType();
548 PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
Matt Arsenault65f67e42014-09-15 15:41:44 +0000549
550 // The operand's value should be corrected on its own.
551 if (isa<AddrSpaceCastInst>(V))
552 continue;
553
554 // FIXME: It doesn't really make sense to try to do this for all
555 // instructions.
Tom Stellard880a80a2014-06-17 16:53:14 +0000556 V->mutateType(NewTy);
557 continue;
558 }
559
560 IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
561 if (!Intr) {
Matt Arsenaultad134842016-02-02 19:18:53 +0000562 // FIXME: What is this for? It doesn't make sense to promote arbitrary
563 // function calls. If the call is to a defined function that can also be
564 // promoted, we should be able to do this once that function is also
565 // rewritten.
566
Tom Stellard880a80a2014-06-17 16:53:14 +0000567 std::vector<Type*> ArgTypes;
568 for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
569 ArgIdx != ArgEnd; ++ArgIdx) {
570 ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
571 }
572 Function *F = Call->getCalledFunction();
573 FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
574 F->isVarArg());
Yaron Keren75e0c4b2015-03-27 17:51:30 +0000575 Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
576 NewType, F->getAttributes());
Tom Stellard880a80a2014-06-17 16:53:14 +0000577 Function *NewF = cast<Function>(C);
578 Call->setCalledFunction(NewF);
579 continue;
580 }
581
582 Builder.SetInsertPoint(Intr);
583 switch (Intr->getIntrinsicID()) {
584 case Intrinsic::lifetime_start:
585 case Intrinsic::lifetime_end:
586 // These intrinsics are for address space 0 only
587 Intr->eraseFromParent();
588 continue;
589 case Intrinsic::memcpy: {
590 MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
591 Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
Pete Cooper67cf9a72015-11-19 05:56:52 +0000592 MemCpy->getLength(), MemCpy->getAlignment(),
593 MemCpy->isVolatile());
Tom Stellard880a80a2014-06-17 16:53:14 +0000594 Intr->eraseFromParent();
595 continue;
596 }
Matt Arsenault7e747f12016-02-02 20:28:10 +0000597 case Intrinsic::memmove: {
598 MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
599 Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
600 MemMove->getLength(), MemMove->getAlignment(),
601 MemMove->isVolatile());
602 Intr->eraseFromParent();
603 continue;
604 }
Tom Stellard880a80a2014-06-17 16:53:14 +0000605 case Intrinsic::memset: {
606 MemSetInst *MemSet = cast<MemSetInst>(Intr);
607 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
Pete Cooper67cf9a72015-11-19 05:56:52 +0000608 MemSet->getLength(), MemSet->getAlignment(),
Tom Stellard880a80a2014-06-17 16:53:14 +0000609 MemSet->isVolatile());
610 Intr->eraseFromParent();
611 continue;
612 }
Matt Arsenault0b783ef02016-01-22 19:47:54 +0000613 case Intrinsic::invariant_start:
614 case Intrinsic::invariant_end:
615 case Intrinsic::invariant_group_barrier:
616 Intr->eraseFromParent();
617 // FIXME: I think the invariant marker should still theoretically apply,
618 // but the intrinsics need to be changed to accept pointers with any
619 // address space.
620 continue;
Matt Arsenault7e747f12016-02-02 20:28:10 +0000621 case Intrinsic::objectsize: {
622 Value *Src = Intr->getOperand(0);
623 Type *SrcTy = Src->getType()->getPointerElementType();
624 Function *ObjectSize = Intrinsic::getDeclaration(Mod,
625 Intrinsic::objectsize,
626 { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
627 );
628
629 CallInst *NewCall
630 = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
631 Intr->replaceAllUsesWith(NewCall);
632 Intr->eraseFromParent();
633 continue;
634 }
Tom Stellard880a80a2014-06-17 16:53:14 +0000635 default:
636 Intr->dump();
637 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
638 }
639 }
640}
641
Matt Arsenaulte0132462016-01-30 05:19:45 +0000642FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
643 return new AMDGPUPromoteAlloca(TM);
Tom Stellard880a80a2014-06-17 16:53:14 +0000644}