//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
  /// binary operator \p V.
  ///
  /// \returns Binary operator \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns The equivalent 16 bit integer type for the given 32 bit integer
  /// type \p T.
  Type *getI16Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns The equivalent 32 bit integer type for the given 16 bit integer
  /// type \p T.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if the base element of type \p T is a 16 bit integer, false
  /// otherwise.
  bool isI16Ty(const Type *T) const;

  /// \returns True if the base element of type \p T is a 32 bit integer, false
  /// otherwise.
  bool isI32Ty(const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \brief Promotes uniform 16 bit binary operation \p I to an equivalent 32
  /// bit binary operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the equivalent 32 bit operation, and
  /// truncating the result of the 32 bit operation back to 16 bits. 16 bit
  /// division operations are not promoted.
  ///
  /// \returns True if the 16 bit binary operation is promoted to the
  /// equivalent 32 bit binary operation, false otherwise.
  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;

  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation by sign or zero extending operands to 32 bits, and replacing
  /// the 16 bit operation with the 32 bit operation.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;

  /// \brief Promotes uniform 16 bit 'select' operation \p I to a 32 bit
  /// 'select' operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the 32 bit operation, and truncating
  /// the result of the 32 bit operation back to 16 bits.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(SelectInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU IR optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  assert(isa<BinaryOperator>(V) && "V must be binary operator");

  BinaryOperator *BinOp = cast<BinaryOperator>(V);
  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp)) {
    BinOp->setIsExact(I.isExact());
  }

  return V;
}

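// Illustrative type mappings for the helpers below (the examples are
// assumptions, not taken from the original source): getI16Ty maps
// i32 -> i16 and <4 x i32> -> <4 x i16>; getI32Ty is the inverse, mapping
// i16 -> i32 and <4 x i16> -> <4 x i32>.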
Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI32Ty(T) && "T must be 32 bits");

  if (T->isIntegerTy())
    return B.getInt16Ty();
  return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI16Ty(T) && "T must be 16 bits");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
  if (T->isIntegerTy(16))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
}

bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
  if (T->isIntegerTy(32))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

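// A sketch of the promotion performed below, using illustrative IR that is
// not taken from the original source: a uniform 16 bit add such as
//   %r = add nuw i16 %a, %b
// is rewritten (with zero extension, since add is not a signed operation
// here) to
//   %a.ext = zext i16 %a to i32
//   %b.ext = zext i16 %b to i32
//   %r.ext = add nuw i32 %a.ext, %b.ext
//   %r.new = trunc i32 %r.ext to i16
// with the nuw flag carried over by copyFlags.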
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

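// A sketch of the 'icmp' promotion below (illustrative IR, not from the
// original source): a uniform
//   %c = icmp slt i16 %a, %b
// becomes (sign extended, because the predicate is signed)
//   %a.ext = sext i16 %a to i32
//   %b.ext = sext i16 %b to i32
//   %c.new = icmp slt i32 %a.ext, %b.ext
// The i1 result requires no truncation.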
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
  assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
  assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
  Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

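// A sketch of the 'select' promotion below (illustrative IR, not from the
// original source): a uniform
//   %r = select i1 %c, i16 %x, i16 %y
// becomes (zero extended here, since %c is not the result of a signed 'icmp')
//   %x.ext = zext i16 %x to i32
//   %y.ext = zext i16 %y to i32
//   %r.ext = select i1 %c, i32 %x.ext, i32 %y.ext
//   %r.new = trunc i32 %r.ext to i16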
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
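//
// An illustrative sketch (example IR, not taken from the original source): a
// divide whose !fpmath metadata permits 2.5 ULP of error, such as
//   %d = fdiv float %x, %y, !fpmath !0
//   !0 = !{float 2.500000e+00}
// is rewritten into a call to the fast fdiv intrinsic, provided f32 denormals
// are disabled or unsafe math is enabled.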
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

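// The attribute queried below is the string function attribute as it appears
// in IR, e.g. (illustrative):
//   attributes #0 = { "unsafe-fp-math"="true" }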
static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
      isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}