blob: cc09ced8f6e918ab1859485a4e21739f96c673ee [file] [log] [blame]
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUSubtarget.h"
17#include "llvm/Analysis/ValueTracking.h"
18#include "llvm/IR/IRBuilder.h"
19#include "llvm/IR/InstVisitor.h"
Matt Arsenaulte0132462016-01-30 05:19:45 +000020#include "llvm/IR/MDBuilder.h"
Tom Stellard880a80a2014-06-17 16:53:14 +000021#include "llvm/Support/Debug.h"
Benjamin Kramer16132e62015-03-23 18:07:13 +000022#include "llvm/Support/raw_ostream.h"
Tom Stellard880a80a2014-06-17 16:53:14 +000023
24#define DEBUG_TYPE "amdgpu-promote-alloca"
25
26using namespace llvm;
27
28namespace {
29
Matt Arsenaulte0132462016-01-30 05:19:45 +000030// FIXME: This can create globals so should be a module pass.
Tom Stellard880a80a2014-06-17 16:53:14 +000031class AMDGPUPromoteAlloca : public FunctionPass,
Matt Arsenaulte0132462016-01-30 05:19:45 +000032 public InstVisitor<AMDGPUPromoteAlloca> {
33private:
34 const TargetMachine *TM;
Tom Stellard880a80a2014-06-17 16:53:14 +000035 Module *Mod;
Matt Arsenaulte0132462016-01-30 05:19:45 +000036 MDNode *MaxWorkGroupSizeRange;
37
38 // FIXME: This should be per-kernel.
Tom Stellard880a80a2014-06-17 16:53:14 +000039 int LocalMemAvailable;
40
Matt Arsenaulte0132462016-01-30 05:19:45 +000041 bool IsAMDGCN;
42 bool IsAMDHSA;
43
44 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
45 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
46
Tom Stellard880a80a2014-06-17 16:53:14 +000047public:
Matt Arsenaulte0132462016-01-30 05:19:45 +000048 static char ID;
49
50 AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
51 FunctionPass(ID),
52 TM(TM_),
53 Mod(nullptr),
54 MaxWorkGroupSizeRange(nullptr),
55 LocalMemAvailable(0),
56 IsAMDGCN(false),
57 IsAMDHSA(false) { }
58
Benjamin Kramer8c90fd72014-09-03 11:41:21 +000059 bool doInitialization(Module &M) override;
60 bool runOnFunction(Function &F) override;
Matt Arsenaulte0132462016-01-30 05:19:45 +000061
62 const char *getPassName() const override {
63 return "AMDGPU Promote Alloca";
64 }
65
Tom Stellard880a80a2014-06-17 16:53:14 +000066 void visitAlloca(AllocaInst &I);
67};
68
69} // End anonymous namespace
70
71char AMDGPUPromoteAlloca::ID = 0;
72
Matt Arsenaulte0132462016-01-30 05:19:45 +000073INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
74 "AMDGPU promote alloca to vector or LDS", false, false)
75
76char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
77
78
Tom Stellard880a80a2014-06-17 16:53:14 +000079bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Matt Arsenaulte0132462016-01-30 05:19:45 +000080 if (!TM)
81 return false;
82
Tom Stellard880a80a2014-06-17 16:53:14 +000083 Mod = &M;
Matt Arsenaulte0132462016-01-30 05:19:45 +000084
85 // The maximum workitem id.
86 //
87 // FIXME: Should get as subtarget property. Usually runtime enforced max is
88 // 256.
89 MDBuilder MDB(Mod->getContext());
90 MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
91
92 const Triple &TT = TM->getTargetTriple();
93
94 IsAMDGCN = TT.getArch() == Triple::amdgcn;
95 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
96
Tom Stellard880a80a2014-06-17 16:53:14 +000097 return false;
98}
99
100bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
Matt Arsenaulte0132462016-01-30 05:19:45 +0000101 if (!TM)
102 return false;
103
104 const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
Tom Stellard880a80a2014-06-17 16:53:14 +0000105
Craig Toppere3dcce92015-08-01 22:20:21 +0000106 FunctionType *FTy = F.getFunctionType();
Tom Stellard880a80a2014-06-17 16:53:14 +0000107 LocalMemAvailable = ST.getLocalMemorySize();
108
109
110 // If the function has any arguments in the local address space, then it's
111 // possible these arguments require the entire local memory space, so
112 // we cannot use local memory in the pass.
113 for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
Craig Toppere3dcce92015-08-01 22:20:21 +0000114 Type *ParamTy = FTy->getParamType(i);
Tom Stellard880a80a2014-06-17 16:53:14 +0000115 if (ParamTy->isPointerTy() &&
116 ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
117 LocalMemAvailable = 0;
118 DEBUG(dbgs() << "Function has local memory argument. Promoting to "
119 "local memory disabled.\n");
120 break;
121 }
122 }
123
124 if (LocalMemAvailable > 0) {
125 // Check how much local memory is being used by global objects
126 for (Module::global_iterator I = Mod->global_begin(),
127 E = Mod->global_end(); I != E; ++I) {
Duncan P. N. Exon Smitha73371a2015-10-13 20:07:10 +0000128 GlobalVariable *GV = &*I;
Manuel Jacob5f6eaac2016-01-16 20:30:46 +0000129 if (GV->getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
Tom Stellard880a80a2014-06-17 16:53:14 +0000130 continue;
131 for (Value::use_iterator U = GV->use_begin(),
132 UE = GV->use_end(); U != UE; ++U) {
133 Instruction *Use = dyn_cast<Instruction>(*U);
134 if (!Use)
135 continue;
136 if (Use->getParent()->getParent() == &F)
137 LocalMemAvailable -=
Manuel Jacob5f6eaac2016-01-16 20:30:46 +0000138 Mod->getDataLayout().getTypeAllocSize(GV->getValueType());
Tom Stellard880a80a2014-06-17 16:53:14 +0000139 }
140 }
141 }
142
143 LocalMemAvailable = std::max(0, LocalMemAvailable);
144 DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
145
146 visit(F);
147
148 return false;
149}
150
Matt Arsenaulte0132462016-01-30 05:19:45 +0000151std::pair<Value *, Value *>
152AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
153 if (!IsAMDHSA) {
154 Function *LocalSizeYFn
155 = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
156 Function *LocalSizeZFn
157 = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
158
159 CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
160 CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
161
162 LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
163 LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
164
165 return std::make_pair(LocalSizeY, LocalSizeZ);
166 }
167
168 // We must read the size out of the dispatch pointer.
169 assert(IsAMDGCN);
170
171 // We are indexing into this struct, and want to extract the workgroup_size_*
172 // fields.
173 //
174 // typedef struct hsa_kernel_dispatch_packet_s {
175 // uint16_t header;
176 // uint16_t setup;
177 // uint16_t workgroup_size_x ;
178 // uint16_t workgroup_size_y;
179 // uint16_t workgroup_size_z;
180 // uint16_t reserved0;
181 // uint32_t grid_size_x ;
182 // uint32_t grid_size_y ;
183 // uint32_t grid_size_z;
184 //
185 // uint32_t private_segment_size;
186 // uint32_t group_segment_size;
187 // uint64_t kernel_object;
188 //
189 // #ifdef HSA_LARGE_MODEL
190 // void *kernarg_address;
191 // #elif defined HSA_LITTLE_ENDIAN
192 // void *kernarg_address;
193 // uint32_t reserved1;
194 // #else
195 // uint32_t reserved1;
196 // void *kernarg_address;
197 // #endif
198 // uint64_t reserved2;
199 // hsa_signal_t completion_signal; // uint64_t wrapper
200 // } hsa_kernel_dispatch_packet_t
201 //
202 Function *DispatchPtrFn
203 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
204
205 CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
206 DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
207 DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
208
209 // Size of the dispatch packet struct.
210 DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
211
212 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
213 Value *CastDispatchPtr = Builder.CreateBitCast(
214 DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
215
216 // We could do a single 64-bit load here, but it's likely that the basic
217 // 32-bit and extract sequence is already present, and it is probably easier
218 // to CSE this. The loads should be mergable later anyway.
219 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
220 LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
221
222 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
223 LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
224
225 MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
226 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
227 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
228 LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
229
230 // Extract y component. Upper half of LoadZU should be zero already.
231 Value *Y = Builder.CreateLShr(LoadXY, 16);
232
233 return std::make_pair(Y, LoadZU);
234}
235
236Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
237 Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
238
239 switch (N) {
240 case 0:
241 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
242 : Intrinsic::r600_read_tidig_x;
243 break;
244 case 1:
245 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
246 : Intrinsic::r600_read_tidig_y;
247 break;
248
249 case 2:
250 IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
251 : Intrinsic::r600_read_tidig_z;
252 break;
253 default:
254 llvm_unreachable("invalid dimension");
255 }
256
257 Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
258 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
259 CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
260
261 return CI;
262}
263
Craig Toppere3dcce92015-08-01 22:20:21 +0000264static VectorType *arrayTypeToVecType(Type *ArrayTy) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000265 return VectorType::get(ArrayTy->getArrayElementType(),
266 ArrayTy->getArrayNumElements());
267}
268
Benjamin Kramerc6cc58e2014-10-04 16:55:56 +0000269static Value *
270calculateVectorIndex(Value *Ptr,
271 const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000272 if (isa<AllocaInst>(Ptr))
273 return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
274
275 GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
276
Benjamin Kramerc6cc58e2014-10-04 16:55:56 +0000277 auto I = GEPIdx.find(GEP);
278 return I == GEPIdx.end() ? nullptr : I->second;
Tom Stellard880a80a2014-06-17 16:53:14 +0000279}
280
281static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
282 // FIXME we only support simple cases
283 if (GEP->getNumOperands() != 3)
284 return NULL;
285
286 ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
287 if (!I0 || !I0->isZero())
288 return NULL;
289
290 return GEP->getOperand(2);
291}
292
Matt Arsenault642d2e72014-06-27 16:52:49 +0000293// Not an instruction handled below to turn into a vector.
294//
295// TODO: Check isTriviallyVectorizable for calls and handle other
296// instructions.
Matt Arsenault7227cc12015-07-28 18:47:00 +0000297static bool canVectorizeInst(Instruction *Inst, User *User) {
Matt Arsenault642d2e72014-06-27 16:52:49 +0000298 switch (Inst->getOpcode()) {
299 case Instruction::Load:
Matt Arsenault642d2e72014-06-27 16:52:49 +0000300 case Instruction::BitCast:
301 case Instruction::AddrSpaceCast:
302 return true;
Matt Arsenault7227cc12015-07-28 18:47:00 +0000303 case Instruction::Store: {
304 // Must be the stored pointer operand, not a stored value.
305 StoreInst *SI = cast<StoreInst>(Inst);
306 return SI->getPointerOperand() == User;
307 }
Matt Arsenault642d2e72014-06-27 16:52:49 +0000308 default:
309 return false;
310 }
311}
312
Tom Stellard880a80a2014-06-17 16:53:14 +0000313static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
314 Type *AllocaTy = Alloca->getAllocatedType();
315
316 DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
317
318 // FIXME: There is no reason why we can't support larger arrays, we
319 // are just being conservative for now.
320 if (!AllocaTy->isArrayTy() ||
321 AllocaTy->getArrayElementType()->isVectorTy() ||
322 AllocaTy->getArrayNumElements() > 4) {
323
324 DEBUG(dbgs() << " Cannot convert type to vector");
325 return false;
326 }
327
328 std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
329 std::vector<Value*> WorkList;
330 for (User *AllocaUser : Alloca->users()) {
331 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
332 if (!GEP) {
Matt Arsenault7227cc12015-07-28 18:47:00 +0000333 if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
Matt Arsenault642d2e72014-06-27 16:52:49 +0000334 return false;
335
Tom Stellard880a80a2014-06-17 16:53:14 +0000336 WorkList.push_back(AllocaUser);
337 continue;
338 }
339
340 Value *Index = GEPToVectorIndex(GEP);
341
342 // If we can't compute a vector index from this GEP, then we can't
343 // promote this alloca to vector.
344 if (!Index) {
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000345 DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000346 return false;
347 }
348
349 GEPVectorIdx[GEP] = Index;
350 for (User *GEPUser : AllocaUser->users()) {
Matt Arsenault7227cc12015-07-28 18:47:00 +0000351 if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
Matt Arsenault642d2e72014-06-27 16:52:49 +0000352 return false;
353
Tom Stellard880a80a2014-06-17 16:53:14 +0000354 WorkList.push_back(GEPUser);
355 }
356 }
357
358 VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
359
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000360 DEBUG(dbgs() << " Converting alloca to vector "
361 << *AllocaTy << " -> " << *VectorTy << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000362
363 for (std::vector<Value*>::iterator I = WorkList.begin(),
364 E = WorkList.end(); I != E; ++I) {
365 Instruction *Inst = cast<Instruction>(*I);
366 IRBuilder<> Builder(Inst);
367 switch (Inst->getOpcode()) {
368 case Instruction::Load: {
369 Value *Ptr = Inst->getOperand(0);
370 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
371 Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
372 Value *VecValue = Builder.CreateLoad(BitCast);
373 Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
374 Inst->replaceAllUsesWith(ExtractElement);
375 Inst->eraseFromParent();
376 break;
377 }
378 case Instruction::Store: {
379 Value *Ptr = Inst->getOperand(1);
380 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
381 Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
382 Value *VecValue = Builder.CreateLoad(BitCast);
383 Value *NewVecValue = Builder.CreateInsertElement(VecValue,
384 Inst->getOperand(0),
385 Index);
386 Builder.CreateStore(NewVecValue, BitCast);
387 Inst->eraseFromParent();
388 break;
389 }
390 case Instruction::BitCast:
Matt Arsenault642d2e72014-06-27 16:52:49 +0000391 case Instruction::AddrSpaceCast:
Tom Stellard880a80a2014-06-17 16:53:14 +0000392 break;
393
394 default:
395 Inst->dump();
Matt Arsenault642d2e72014-06-27 16:52:49 +0000396 llvm_unreachable("Inconsistency in instructions promotable to vector");
Tom Stellard880a80a2014-06-17 16:53:14 +0000397 }
398 }
399 return true;
400}
401
Tom Stellard5b2927f2014-10-31 20:52:04 +0000402static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
403 bool Success = true;
Tom Stellard880a80a2014-06-17 16:53:14 +0000404 for (User *User : Val->users()) {
405 if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
406 continue;
Matt Arsenaultfdcd39a2015-07-28 18:29:14 +0000407 if (CallInst *CI = dyn_cast<CallInst>(User)) {
408 // TODO: We might be able to handle some cases where the callee is a
409 // constantexpr bitcast of a function.
410 if (!CI->getCalledFunction())
411 return false;
412
Tom Stellard880a80a2014-06-17 16:53:14 +0000413 WorkList.push_back(User);
414 continue;
415 }
Tom Stellard5b2927f2014-10-31 20:52:04 +0000416
417 // FIXME: Correctly handle ptrtoint instructions.
418 Instruction *UseInst = dyn_cast<Instruction>(User);
419 if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
420 return false;
421
Matt Arsenault7227cc12015-07-28 18:47:00 +0000422 if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
423 // Reject if the stored value is not the pointer operand.
424 if (SI->getPointerOperand() != Val)
425 return false;
426 }
427
Tom Stellard880a80a2014-06-17 16:53:14 +0000428 if (!User->getType()->isPointerTy())
429 continue;
Tom Stellard5b2927f2014-10-31 20:52:04 +0000430
Tom Stellard880a80a2014-06-17 16:53:14 +0000431 WorkList.push_back(User);
Tom Stellard5b2927f2014-10-31 20:52:04 +0000432
433 Success &= collectUsesWithPtrTypes(User, WorkList);
Tom Stellard880a80a2014-06-17 16:53:14 +0000434 }
Tom Stellard5b2927f2014-10-31 20:52:04 +0000435 return Success;
Tom Stellard880a80a2014-06-17 16:53:14 +0000436}
437
438void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
Matt Arsenault19c54882015-08-26 18:37:13 +0000439 if (!I.isStaticAlloca())
440 return;
441
Tom Stellard880a80a2014-06-17 16:53:14 +0000442 IRBuilder<> Builder(&I);
443
444 // First try to replace the alloca with a vector
445 Type *AllocaTy = I.getAllocatedType();
446
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000447 DEBUG(dbgs() << "Trying to promote " << I << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000448
449 if (tryPromoteAllocaToVector(&I))
450 return;
451
452 DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
453
454 // FIXME: This is the maximum work group size. We should try to get
455 // value from the reqd_work_group_size function attribute if it is
456 // available.
457 unsigned WorkGroupSize = 256;
Mehdi Amini46a43552015-03-04 18:43:29 +0000458 int AllocaSize =
459 WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
Tom Stellard880a80a2014-06-17 16:53:14 +0000460
461 if (AllocaSize > LocalMemAvailable) {
462 DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
463 return;
464 }
465
Tom Stellard5b2927f2014-10-31 20:52:04 +0000466 std::vector<Value*> WorkList;
467
468 if (!collectUsesWithPtrTypes(&I, WorkList)) {
469 DEBUG(dbgs() << " Do not know how to convert all uses\n");
470 return;
471 }
472
Tom Stellard880a80a2014-06-17 16:53:14 +0000473 DEBUG(dbgs() << "Promoting alloca to local memory\n");
474 LocalMemAvailable -= AllocaSize;
475
David Blaikie156d46e2015-03-24 23:34:31 +0000476 Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
Tom Stellard880a80a2014-06-17 16:53:14 +0000477 GlobalVariable *GV = new GlobalVariable(
David Blaikie156d46e2015-03-24 23:34:31 +0000478 *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
Tom Stellard880a80a2014-06-17 16:53:14 +0000479 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
480
Matt Arsenaulte0132462016-01-30 05:19:45 +0000481 Value *TCntY, *TCntZ;
Tom Stellard880a80a2014-06-17 16:53:14 +0000482
Matt Arsenaulte0132462016-01-30 05:19:45 +0000483 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
484 Value *TIdX = getWorkitemID(Builder, 0);
485 Value *TIdY = getWorkitemID(Builder, 1);
486 Value *TIdZ = getWorkitemID(Builder, 2);
Tom Stellard880a80a2014-06-17 16:53:14 +0000487
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000488 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
Tom Stellard880a80a2014-06-17 16:53:14 +0000489 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000490 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
Tom Stellard880a80a2014-06-17 16:53:14 +0000491 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
492 TID = Builder.CreateAdd(TID, TIdZ);
493
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000494 Value *Indices[] = {
495 Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
496 TID
497 };
Tom Stellard880a80a2014-06-17 16:53:14 +0000498
Matt Arsenault853a1fc2016-02-02 19:18:48 +0000499 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
Tom Stellard880a80a2014-06-17 16:53:14 +0000500 I.mutateType(Offset->getType());
501 I.replaceAllUsesWith(Offset);
502 I.eraseFromParent();
503
Tom Stellard880a80a2014-06-17 16:53:14 +0000504 for (std::vector<Value*>::iterator i = WorkList.begin(),
505 e = WorkList.end(); i != e; ++i) {
506 Value *V = *i;
507 CallInst *Call = dyn_cast<CallInst>(V);
508 if (!Call) {
509 Type *EltTy = V->getType()->getPointerElementType();
510 PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
Matt Arsenault65f67e42014-09-15 15:41:44 +0000511
512 // The operand's value should be corrected on its own.
513 if (isa<AddrSpaceCastInst>(V))
514 continue;
515
516 // FIXME: It doesn't really make sense to try to do this for all
517 // instructions.
Tom Stellard880a80a2014-06-17 16:53:14 +0000518 V->mutateType(NewTy);
519 continue;
520 }
521
522 IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
523 if (!Intr) {
524 std::vector<Type*> ArgTypes;
525 for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
526 ArgIdx != ArgEnd; ++ArgIdx) {
527 ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
528 }
529 Function *F = Call->getCalledFunction();
530 FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
531 F->isVarArg());
Yaron Keren75e0c4b2015-03-27 17:51:30 +0000532 Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
533 NewType, F->getAttributes());
Tom Stellard880a80a2014-06-17 16:53:14 +0000534 Function *NewF = cast<Function>(C);
535 Call->setCalledFunction(NewF);
536 continue;
537 }
538
539 Builder.SetInsertPoint(Intr);
540 switch (Intr->getIntrinsicID()) {
541 case Intrinsic::lifetime_start:
542 case Intrinsic::lifetime_end:
543 // These intrinsics are for address space 0 only
544 Intr->eraseFromParent();
545 continue;
546 case Intrinsic::memcpy: {
547 MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
548 Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
Pete Cooper67cf9a72015-11-19 05:56:52 +0000549 MemCpy->getLength(), MemCpy->getAlignment(),
550 MemCpy->isVolatile());
Tom Stellard880a80a2014-06-17 16:53:14 +0000551 Intr->eraseFromParent();
552 continue;
553 }
554 case Intrinsic::memset: {
555 MemSetInst *MemSet = cast<MemSetInst>(Intr);
556 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
Pete Cooper67cf9a72015-11-19 05:56:52 +0000557 MemSet->getLength(), MemSet->getAlignment(),
Tom Stellard880a80a2014-06-17 16:53:14 +0000558 MemSet->isVolatile());
559 Intr->eraseFromParent();
560 continue;
561 }
Matt Arsenault0b783ef02016-01-22 19:47:54 +0000562 case Intrinsic::invariant_start:
563 case Intrinsic::invariant_end:
564 case Intrinsic::invariant_group_barrier:
565 Intr->eraseFromParent();
566 // FIXME: I think the invariant marker should still theoretically apply,
567 // but the intrinsics need to be changed to accept pointers with any
568 // address space.
569 continue;
Tom Stellard880a80a2014-06-17 16:53:14 +0000570 default:
571 Intr->dump();
572 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
573 }
574 }
575}
576
Matt Arsenaulte0132462016-01-30 05:19:45 +0000577FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
578 return new AMDGPUPromoteAlloca(TM);
Tom Stellard880a80a2014-06-17 16:53:14 +0000579}