blob: 8e85b03d4eb1e76574ae601b3cd2ee577a399c57 [file] [log] [blame]
Tom Stellard880a80a2014-06-17 16:53:14 +00001//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This pass eliminates allocas by either converting them into vectors or
11// by migrating them to local address space.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUSubtarget.h"
17#include "llvm/Analysis/ValueTracking.h"
18#include "llvm/IR/IRBuilder.h"
19#include "llvm/IR/InstVisitor.h"
20#include "llvm/Support/Debug.h"
21
22#define DEBUG_TYPE "amdgpu-promote-alloca"
23
24using namespace llvm;
25
26namespace {
27
28class AMDGPUPromoteAlloca : public FunctionPass,
29 public InstVisitor<AMDGPUPromoteAlloca> {
30
31 static char ID;
32 Module *Mod;
33 const AMDGPUSubtarget &ST;
34 int LocalMemAvailable;
35
36public:
37 AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
38 LocalMemAvailable(0) { }
Benjamin Kramer8c90fd72014-09-03 11:41:21 +000039 bool doInitialization(Module &M) override;
40 bool runOnFunction(Function &F) override;
41 const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
Tom Stellard880a80a2014-06-17 16:53:14 +000042 void visitAlloca(AllocaInst &I);
43};
44
45} // End anonymous namespace
46
47char AMDGPUPromoteAlloca::ID = 0;
48
49bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
50 Mod = &M;
51 return false;
52}
53
54bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
55
56 const FunctionType *FTy = F.getFunctionType();
57
58 LocalMemAvailable = ST.getLocalMemorySize();
59
60
61 // If the function has any arguments in the local address space, then it's
62 // possible these arguments require the entire local memory space, so
63 // we cannot use local memory in the pass.
64 for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
65 const Type *ParamTy = FTy->getParamType(i);
66 if (ParamTy->isPointerTy() &&
67 ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
68 LocalMemAvailable = 0;
69 DEBUG(dbgs() << "Function has local memory argument. Promoting to "
70 "local memory disabled.\n");
71 break;
72 }
73 }
74
75 if (LocalMemAvailable > 0) {
76 // Check how much local memory is being used by global objects
77 for (Module::global_iterator I = Mod->global_begin(),
78 E = Mod->global_end(); I != E; ++I) {
79 GlobalVariable *GV = I;
80 PointerType *GVTy = GV->getType();
81 if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
82 continue;
83 for (Value::use_iterator U = GV->use_begin(),
84 UE = GV->use_end(); U != UE; ++U) {
85 Instruction *Use = dyn_cast<Instruction>(*U);
86 if (!Use)
87 continue;
88 if (Use->getParent()->getParent() == &F)
89 LocalMemAvailable -=
90 Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
91 }
92 }
93 }
94
95 LocalMemAvailable = std::max(0, LocalMemAvailable);
96 DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
97
98 visit(F);
99
100 return false;
101}
102
103static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
104 return VectorType::get(ArrayTy->getArrayElementType(),
105 ArrayTy->getArrayNumElements());
106}
107
Benjamin Kramerc6cc58e2014-10-04 16:55:56 +0000108static Value *
109calculateVectorIndex(Value *Ptr,
110 const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
Tom Stellard880a80a2014-06-17 16:53:14 +0000111 if (isa<AllocaInst>(Ptr))
112 return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
113
114 GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
115
Benjamin Kramerc6cc58e2014-10-04 16:55:56 +0000116 auto I = GEPIdx.find(GEP);
117 return I == GEPIdx.end() ? nullptr : I->second;
Tom Stellard880a80a2014-06-17 16:53:14 +0000118}
119
120static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
121 // FIXME we only support simple cases
122 if (GEP->getNumOperands() != 3)
123 return NULL;
124
125 ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
126 if (!I0 || !I0->isZero())
127 return NULL;
128
129 return GEP->getOperand(2);
130}
131
Matt Arsenault642d2e72014-06-27 16:52:49 +0000132// Not an instruction handled below to turn into a vector.
133//
134// TODO: Check isTriviallyVectorizable for calls and handle other
135// instructions.
136static bool canVectorizeInst(Instruction *Inst) {
137 switch (Inst->getOpcode()) {
138 case Instruction::Load:
139 case Instruction::Store:
140 case Instruction::BitCast:
141 case Instruction::AddrSpaceCast:
142 return true;
143 default:
144 return false;
145 }
146}
147
Tom Stellard880a80a2014-06-17 16:53:14 +0000148static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
149 Type *AllocaTy = Alloca->getAllocatedType();
150
151 DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
152
153 // FIXME: There is no reason why we can't support larger arrays, we
154 // are just being conservative for now.
155 if (!AllocaTy->isArrayTy() ||
156 AllocaTy->getArrayElementType()->isVectorTy() ||
157 AllocaTy->getArrayNumElements() > 4) {
158
159 DEBUG(dbgs() << " Cannot convert type to vector");
160 return false;
161 }
162
163 std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
164 std::vector<Value*> WorkList;
165 for (User *AllocaUser : Alloca->users()) {
166 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
167 if (!GEP) {
Matt Arsenault642d2e72014-06-27 16:52:49 +0000168 if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
169 return false;
170
Tom Stellard880a80a2014-06-17 16:53:14 +0000171 WorkList.push_back(AllocaUser);
172 continue;
173 }
174
175 Value *Index = GEPToVectorIndex(GEP);
176
177 // If we can't compute a vector index from this GEP, then we can't
178 // promote this alloca to vector.
179 if (!Index) {
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000180 DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000181 return false;
182 }
183
184 GEPVectorIdx[GEP] = Index;
185 for (User *GEPUser : AllocaUser->users()) {
Matt Arsenault642d2e72014-06-27 16:52:49 +0000186 if (!canVectorizeInst(cast<Instruction>(GEPUser)))
187 return false;
188
Tom Stellard880a80a2014-06-17 16:53:14 +0000189 WorkList.push_back(GEPUser);
190 }
191 }
192
193 VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
194
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000195 DEBUG(dbgs() << " Converting alloca to vector "
196 << *AllocaTy << " -> " << *VectorTy << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000197
198 for (std::vector<Value*>::iterator I = WorkList.begin(),
199 E = WorkList.end(); I != E; ++I) {
200 Instruction *Inst = cast<Instruction>(*I);
201 IRBuilder<> Builder(Inst);
202 switch (Inst->getOpcode()) {
203 case Instruction::Load: {
204 Value *Ptr = Inst->getOperand(0);
205 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
206 Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
207 Value *VecValue = Builder.CreateLoad(BitCast);
208 Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
209 Inst->replaceAllUsesWith(ExtractElement);
210 Inst->eraseFromParent();
211 break;
212 }
213 case Instruction::Store: {
214 Value *Ptr = Inst->getOperand(1);
215 Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
216 Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
217 Value *VecValue = Builder.CreateLoad(BitCast);
218 Value *NewVecValue = Builder.CreateInsertElement(VecValue,
219 Inst->getOperand(0),
220 Index);
221 Builder.CreateStore(NewVecValue, BitCast);
222 Inst->eraseFromParent();
223 break;
224 }
225 case Instruction::BitCast:
Matt Arsenault642d2e72014-06-27 16:52:49 +0000226 case Instruction::AddrSpaceCast:
Tom Stellard880a80a2014-06-17 16:53:14 +0000227 break;
228
229 default:
230 Inst->dump();
Matt Arsenault642d2e72014-06-27 16:52:49 +0000231 llvm_unreachable("Inconsistency in instructions promotable to vector");
Tom Stellard880a80a2014-06-17 16:53:14 +0000232 }
233 }
234 return true;
235}
236
237static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
238 for (User *User : Val->users()) {
239 if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
240 continue;
241 if (isa<CallInst>(User)) {
242 WorkList.push_back(User);
243 continue;
244 }
245 if (!User->getType()->isPointerTy())
246 continue;
247 WorkList.push_back(User);
248 collectUsesWithPtrTypes(User, WorkList);
249 }
250}
251
252void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
253 IRBuilder<> Builder(&I);
254
255 // First try to replace the alloca with a vector
256 Type *AllocaTy = I.getAllocatedType();
257
Matt Arsenault6f62cf82014-06-27 02:36:59 +0000258 DEBUG(dbgs() << "Trying to promote " << I << '\n');
Tom Stellard880a80a2014-06-17 16:53:14 +0000259
260 if (tryPromoteAllocaToVector(&I))
261 return;
262
263 DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
264
265 // FIXME: This is the maximum work group size. We should try to get
266 // value from the reqd_work_group_size function attribute if it is
267 // available.
268 unsigned WorkGroupSize = 256;
269 int AllocaSize = WorkGroupSize *
270 Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
271
272 if (AllocaSize > LocalMemAvailable) {
273 DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
274 return;
275 }
276
277 DEBUG(dbgs() << "Promoting alloca to local memory\n");
278 LocalMemAvailable -= AllocaSize;
279
280 GlobalVariable *GV = new GlobalVariable(
281 *Mod, ArrayType::get(I.getAllocatedType(), 256), false,
282 GlobalValue::ExternalLinkage, 0, I.getName(), 0,
283 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
284
285 FunctionType *FTy = FunctionType::get(
286 Type::getInt32Ty(Mod->getContext()), false);
287 AttributeSet AttrSet;
288 AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
289
290 Value *ReadLocalSizeY = Mod->getOrInsertFunction(
291 "llvm.r600.read.local.size.y", FTy, AttrSet);
292 Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
293 "llvm.r600.read.local.size.z", FTy, AttrSet);
294 Value *ReadTIDIGX = Mod->getOrInsertFunction(
295 "llvm.r600.read.tidig.x", FTy, AttrSet);
296 Value *ReadTIDIGY = Mod->getOrInsertFunction(
297 "llvm.r600.read.tidig.y", FTy, AttrSet);
298 Value *ReadTIDIGZ = Mod->getOrInsertFunction(
299 "llvm.r600.read.tidig.z", FTy, AttrSet);
300
301
302 Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
303 Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
304 Value *TIdX = Builder.CreateCall(ReadTIDIGX);
305 Value *TIdY = Builder.CreateCall(ReadTIDIGY);
306 Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
307
308 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
309 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
310 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
311 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
312 TID = Builder.CreateAdd(TID, TIdZ);
313
314 std::vector<Value*> Indices;
315 Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
316 Indices.push_back(TID);
317
318 Value *Offset = Builder.CreateGEP(GV, Indices);
319 I.mutateType(Offset->getType());
320 I.replaceAllUsesWith(Offset);
321 I.eraseFromParent();
322
323 std::vector<Value*> WorkList;
324
325 collectUsesWithPtrTypes(Offset, WorkList);
326
327 for (std::vector<Value*>::iterator i = WorkList.begin(),
328 e = WorkList.end(); i != e; ++i) {
329 Value *V = *i;
330 CallInst *Call = dyn_cast<CallInst>(V);
331 if (!Call) {
332 Type *EltTy = V->getType()->getPointerElementType();
333 PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
Matt Arsenault65f67e42014-09-15 15:41:44 +0000334
335 // The operand's value should be corrected on its own.
336 if (isa<AddrSpaceCastInst>(V))
337 continue;
338
339 // FIXME: It doesn't really make sense to try to do this for all
340 // instructions.
Tom Stellard880a80a2014-06-17 16:53:14 +0000341 V->mutateType(NewTy);
342 continue;
343 }
344
345 IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
346 if (!Intr) {
347 std::vector<Type*> ArgTypes;
348 for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
349 ArgIdx != ArgEnd; ++ArgIdx) {
350 ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
351 }
352 Function *F = Call->getCalledFunction();
353 FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
354 F->isVarArg());
355 Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
356 F->getAttributes());
357 Function *NewF = cast<Function>(C);
358 Call->setCalledFunction(NewF);
359 continue;
360 }
361
362 Builder.SetInsertPoint(Intr);
363 switch (Intr->getIntrinsicID()) {
364 case Intrinsic::lifetime_start:
365 case Intrinsic::lifetime_end:
366 // These intrinsics are for address space 0 only
367 Intr->eraseFromParent();
368 continue;
369 case Intrinsic::memcpy: {
370 MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
371 Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
372 MemCpy->getLength(), MemCpy->getAlignment(),
373 MemCpy->isVolatile());
374 Intr->eraseFromParent();
375 continue;
376 }
377 case Intrinsic::memset: {
378 MemSetInst *MemSet = cast<MemSetInst>(Intr);
379 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
380 MemSet->getLength(), MemSet->getAlignment(),
381 MemSet->isVolatile());
382 Intr->eraseFromParent();
383 continue;
384 }
385 default:
386 Intr->dump();
387 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
388 }
389 }
390}
391
392FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
393 return new AMDGPUPromoteAlloca(ST);
394}