//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with an equivalent 32 bit binary operation, and
  /// truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// \brief Check whether a scalar load can be widened.
  ///
  /// \details Uniform, sub-32-bit loads from constant memory are widened to a
  /// full 32 bit load and then truncated, allowing a scalar load to be
  /// selected instead of a vector load.
  ///
  /// \returns True if load \p I can be widened this way, false otherwise.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}
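
// For example (not exhaustive): with 16-bit instructions available this
// accepts i8 and i16, and vectors of such elements when the subtarget lacks
// packed VOP3P instructions, while i1 and types of 32 bits or wider are not
// promoted.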

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
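
// Rough intuition for the flags above: the promoted operands come from a type
// of at most 16 bits, so e.g. adding two zero-extended i16 values yields at
// most a 17-bit result, which cannot wrap in 32 bits; the sign-extended case
// is analogous.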

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
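
// For illustration (a sketch, not taken from a testcase): a uniform
//   %r = add i16 %a, %b
// becomes roughly
//   %a32 = zext i16 %a to i32
//   %b32 = zext i16 %b to i32
//   %r32 = add nuw nsw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16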

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes = Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
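
// For illustration (a sketch): a uniform
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// becomes roughly
//   %x32 = zext i16 %x to i32
//   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
//   %s   = lshr i32 %r32, 16
//   %r   = trunc i32 %s to i16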

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
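//
// For illustration (a sketch): with an !fpmath hint allowing at least 2.5 ULP,
// no unsafe-math flags, and no f32 denormals,
//   %d = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.5}
// becomes roughly
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)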
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    Value *WidenLoad = Builder.CreateLoad(BitCast);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}
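
// For illustration (a sketch; the constant address space number depends on the
// target's address-space mapping): a uniform, sufficiently aligned
//   %v = load i16, i16 addrspace(C)* %p, align 4
// becomes roughly
//   %q = bitcast i16 addrspace(C)* %p to i32 addrspace(C)*
//   %w = load i32, i32 addrspace(C)* %q
//   %v = trunc i32 %w to i16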

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}