//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to the local address space.
//
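// As an illustrative sketch (not IR from an actual test), a private array
// such as:
//
//   %stack = alloca [4 x i32]
//   %ptr = getelementptr [4 x i32], [4 x i32]* %stack, i32 0, i32 %idx
//
// can have loads and stores through %ptr rewritten as extractelement and
// insertelement on a <4 x i32> value, while an alloca that cannot be
// vectorized may instead be replaced with a pointer into a workgroup-shared
// LDS array indexed by the linear workitem id.
//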
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

// FIXME: This can create globals, so it should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass,
                            public InstVisitor<AMDGPUPromoteAlloca> {
private:
  const TargetMachine *TM;
  Module *Mod;
  MDNode *MaxWorkGroupSizeRange;

  // FIXME: This should be per-kernel.
  int LocalMemAvailable;

  bool IsAMDGCN;
  bool IsAMDHSA;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

public:
  static char ID;

  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
    FunctionPass(ID),
    TM(TM_),
    Mod(nullptr),
    MaxWorkGroupSizeRange(nullptr),
    LocalMemAvailable(0),
    IsAMDGCN(false),
    IsAMDHSA(false) { }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU Promote Alloca";
  }

  void visitAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  if (!TM)
    return false;

  Mod = &M;

  // Range metadata for the maximum workitem id: [0, 2048).
  //
  // FIXME: Should get this as a subtarget property; the runtime-enforced
  // maximum is usually 256.
  MDBuilder MDB(Mod->getContext());
  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));

  const Triple &TT = TM->getTargetTriple();

  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM || F.hasFnAttribute(Attribute::OptimizeNone))
    return false;

  FunctionType *FTy = F.getFunctionType();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promotion to "
                      "local memory disabled.\n");
      return false;
    }
  }

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  LocalMemAvailable = ST.getLocalMemorySize();
  if (LocalMemAvailable == 0)
    return false;

  // Subtract the local memory already used by global objects referenced from
  // this function.
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    for (Use &U : GV.uses()) {
      Instruction *UseInst = dyn_cast<Instruction>(U);
      if (!UseInst)
        continue;

      if (UseInst->getParent()->getParent() == &F) {
        // Count each global only once, no matter how many times this
        // function uses it.
        LocalMemAvailable -=
          Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
        break;
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  return true;
}
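
// Return the workgroup size in the Y and Z dimensions as a pair of values
// usable at the builder's insertion point. On non-HSA targets these come
// from the r600 local size intrinsics; on amdhsa they are loaded out of the
// kernel dispatch packet (see the struct layout in the function body).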
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  // typedef struct hsa_kernel_dispatch_packet_s {
  //   uint16_t header;
  //   uint16_t setup;
  //   uint16_t workgroup_size_x;
  //   uint16_t workgroup_size_y;
  //   uint16_t workgroup_size_z;
  //   uint16_t reserved0;
  //   uint32_t grid_size_x;
  //   uint32_t grid_size_y;
  //   uint32_t grid_size_z;
  //
  //   uint32_t private_segment_size;
  //   uint32_t group_segment_size;
  //   uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //   void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //   void *kernarg_address;
  //   uint32_t reserved1;
  // #else
  //   uint32_t reserved1;
  //   void *kernarg_address;
  // #endif
  //   uint64_t reserved2;
  //   hsa_signal_t completion_signal; // uint64_t wrapper
  // } hsa_kernel_dispatch_packet_t;
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
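  // Byte offset 4 (i32 index 1) holds workgroup_size_x in its low 16 bits
  // and workgroup_size_y in its high 16 bits; byte offset 8 (i32 index 2)
  // holds workgroup_size_z in its low 16 bits and reserved0 above it.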
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);

  MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}
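
// Return the workitem id for dimension N (0 = x, 1 = y, 2 = z), annotated
// with the maximum workgroup size range so later passes can bound its value.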
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
                      : Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
                      : Intrinsic::r600_read_tidig_y;
    break;
  case 2:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
                      : Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  return CI;
}

static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}
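
// Return the vector element index for the one GEP shape handled here: a
// two-index GEP into the alloca whose first index is a constant zero, e.g.
//
//   getelementptr [4 x i32], [4 x i32]* %alloca, i32 0, i32 %idx
//
// yields %idx. Anything more complex is rejected by returning null.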
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases.
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}

// Return true if Inst is an instruction we know how to rewrite when the
// alloca it reaches through the operand User (the alloca itself, or a GEP
// into it) is turned into a vector.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value.
    StoreInst *SI = cast<StoreInst>(Inst);
    return SI->getPointerOperand() == User;
  }
  default:
    return false;
  }
}
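
// Try to rewrite a small array alloca as a vector. In sketch form (for an
// illustrative [4 x i32] alloca), a load through a GEP with vector index
// %idx becomes:
//
//   %cast = bitcast [4 x i32]* %alloca to <4 x i32>*
//   %vec = load <4 x i32>, <4 x i32>* %cast
//   %val = extractelement <4 x i32> %vec, i32 %idx
//
// and a store becomes the corresponding load/insertelement/store sequence
// on the whole vector.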
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays; we are
  // just being conservative for now.
  if (!AllocaTy ||
      AllocaTy->getElementType()->isVectorTy() ||
      AllocaTy->getNumElements() > 4) {
    DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << "  Converting alloca to vector "
               << *AllocaTy << " -> " << *VectorTy << '\n');

  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

static bool isCallPromotable(CallInst *CI) {
  // TODO: We might be able to handle some cases where the callee is a
  // constantexpr bitcast of a function.
  if (!CI->getCalledFunction())
    return false;

  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::invariant_group_barrier:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}
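
// Collect the transitive pointer-typed users of Val into WorkList so they
// can later be rewritten to the local address space. Return false if any
// user is one we do not know how to rewrite: a non-promotable call, a
// ptrtoint, or a store of the pointer value itself.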
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  for (User *User : Val->users()) {
    if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    // FIXME: Correctly handle ptrtoint instructions.
    Instruction *UseInst = dyn_cast<Instruction>(User);
    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
      // Reject stores where Val is the value being stored rather than the
      // pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    }

    if (!User->getType()->isPointerTy())
      continue;

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(User, WorkList))
      return false;
  }

  return true;
}
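
// Promote a static alloca, first by trying to turn it into a vector and,
// failing that, by giving each workitem a slot in a workgroup-shared LDS
// array. In sketch form, the LDS replacement is:
//
//   @alloca.lds = addrspace(3) global [256 x <elt ty>] ...  ; slot per thread
//   %tid = <linearized workitem id>
//   %ptr = getelementptr [256 x <elt ty>], ... @alloca.lds, i32 0, i32 %tid
//
// where @alloca.lds and %tid are illustrative names, not ones the pass
// actually emits.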
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  if (!I.isStaticAlloca())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector.
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << "  alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size. We should try to get the
  // value from the reqd_work_group_size function attribute if it is
  // available.
  unsigned WorkGroupSize = 256;
  int AllocaSize =
    WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);

  if (AllocaSize > LocalMemAvailable) {
    DEBUG(dbgs() << "  Not enough local memory to promote alloca.\n");
    return;
  }

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, WorkList)) {
    DEBUG(dbgs() << "  Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  LocalMemAvailable -= AllocaSize;

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::ExternalLinkage, nullptr, I.getName(),
      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);
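
  // Linearize the workitem id:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ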
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // The operand's value should be corrected on its own.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // FIXME: What is this for? It doesn't make sense to promote arbitrary
      // function calls. If the call is to a defined function that can also be
      // promoted, we should be able to do this once that function is also
      // rewritten.

      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
           ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }

      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
                                             NewType, F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memmove: {
      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
                            MemMove->getLength(), MemMove->getAlignment(),
                            MemMove->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::invariant_group_barrier:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
        Intrinsic::objectsize,
        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
      );

      CallInst *NewCall
        = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
  return new AMDGPUPromoteAlloca(TM);
}