//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

16#include "AMDGPU.h"
17#include "AMDGPUSubtarget.h"
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +000018#include "AMDGPUTargetMachine.h"
Eugene Zelenko734bb7b2017-01-20 17:52:16 +000019#include "llvm/ADT/StringRef.h"
Matt Arsenault86de4862016-06-24 07:07:55 +000020#include "llvm/Analysis/DivergenceAnalysis.h"
Wei Dinga126a132017-07-26 21:07:28 +000021#include "llvm/Analysis/Loads.h"
Matt Arsenault86de4862016-06-24 07:07:55 +000022#include "llvm/CodeGen/Passes.h"
Francis Visoiu Mistrih8b617642017-05-18 17:21:13 +000023#include "llvm/CodeGen/TargetPassConfig.h"
Eugene Zelenko734bb7b2017-01-20 17:52:16 +000024#include "llvm/IR/Attributes.h"
25#include "llvm/IR/BasicBlock.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/DerivedTypes.h"
28#include "llvm/IR/Function.h"
Chandler Carruth6bda14b2017-06-06 11:49:48 +000029#include "llvm/IR/IRBuilder.h"
30#include "llvm/IR/InstVisitor.h"
Eugene Zelenko734bb7b2017-01-20 17:52:16 +000031#include "llvm/IR/InstrTypes.h"
32#include "llvm/IR/Instruction.h"
33#include "llvm/IR/Instructions.h"
Eugene Zelenko734bb7b2017-01-20 17:52:16 +000034#include "llvm/IR/IntrinsicInst.h"
35#include "llvm/IR/Intrinsics.h"
Eugene Zelenko734bb7b2017-01-20 17:52:16 +000036#include "llvm/IR/LLVMContext.h"
37#include "llvm/IR/Operator.h"
38#include "llvm/IR/Type.h"
39#include "llvm/IR/Value.h"
40#include "llvm/Pass.h"
41#include "llvm/Support/Casting.h"
42#include <cassert>
43#include <iterator>
Matt Arsenault86de4862016-06-24 07:07:55 +000044
45#define DEBUG_TYPE "amdgpu-codegenprepare"
46
47using namespace llvm;
48
49namespace {
50
Matt Arsenault90083d32018-06-07 09:54:49 +000051static cl::opt<bool> WidenLoads(
52 "amdgpu-codegenprepare-widen-constant-loads",
53 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
54 cl::ReallyHidden,
55 cl::init(true));
56
class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting the
  /// result of the 32 bit 'bitreverse' intrinsic to the right with zero fill
  /// (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Widen a sub-dword scalar load.
  ///
  /// \details Uniform, small-type loads from constant memory are widened to a
  /// full 32 bits and then truncated back to the original type, allowing a
  /// scalar load to be selected instead of a vector load.
  ///
  /// \returns True if \p I is a load that can be widened this way.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

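  // Widen only simple (non-atomic, non-volatile), sub-dword, uniform loads
  // that are at least dword aligned.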
  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
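  // Transfer the no-wrap and exact flags that remain valid on the widened
  // operation.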
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

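  // Extend both operands according to the signedness of the predicate so the
  // 32 bit compare gives the same result as the original one.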
  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
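  // Zero extension puts the original bits in the low end of the i32, so the
  // reversed bits land in the high end; shift them back down to the low bits
  // before truncating to the original width.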
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
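  // llvm.amdgcn.fdiv.fast is only accurate to about 2.5 ulp, so it is only a
  // valid replacement when the !fpmath metadata permits at least that much
  // error.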
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

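  // Only promote when the subtarget actually has 16 bit instructions (on
  // older targets legalization widens these types anyway) and the value is
  // uniform, since uniform operations run on the scalar unit, which has no
  // 16 bit ALU operations.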
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

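    // Load a full dword through an i32 pointer so the access can be selected
    // as a scalar load; the result is narrowed back to the original type
    // below.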
    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

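    // Narrow the widened value back to the original width, then bitcast to
    // the original (possibly non-integer) type.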
    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);
  AMDGPUASI = TM.getAMDGPUAS();

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
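    // The visitors may erase the current instruction, so compute the next
    // iterator before visiting.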
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}