//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

51class AMDGPUCodeGenPrepare : public FunctionPass,
Matt Arsenaulta1fe17c2016-07-19 23:16:53 +000052 public InstVisitor<AMDGPUCodeGenPrepare, bool> {
Eugene Zelenko734bb7b2017-01-20 17:52:16 +000053 const SISubtarget *ST = nullptr;
54 DivergenceAnalysis *DA = nullptr;
55 Module *Mod = nullptr;
56 bool HasUnsafeFPMath = false;
Wei Dinga126a132017-07-26 21:07:28 +000057 AMDGPUAS AMDGPUASI;
Matt Arsenault86de4862016-06-24 07:07:55 +000058
  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Widen a scalar load.
  ///
  /// \details Widens a uniform, sub-32-bit scalar load from constant memory to
  /// a full 32 bits and then truncates the result, so that a scalar load can
  /// be selected instead of a vector load. This predicate only checks whether
  /// \p I is a candidate; the rewrite itself is done in visitLoadInst.
  ///
  /// \returns True if \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

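// Illustrative examples for the predicate below: i8 and i16 are promoted,
// while i1 and i32 are not. For vectors such as <2 x i16>, promotion only
// happens on subtargets without packed (VOP3P) instructions.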
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

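// Note on the two helpers below (a reading of the code, not authoritative):
// the operands of a promoted operation were extended from at most 16 bits, so
// for the opcodes that return true unconditionally the 32-bit result has
// enough headroom that it cannot wrap; the remaining cases only add a flag
// when the original instruction's own wrap flags guarantee it.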
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

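// Example (an illustrative sketch, not taken from a test): a uniform
//   %r = add i16 %a, %b
// is rewritten by promoteUniformOpToI32 as roughly
//   %a32 = zext i16 %a to i32
//   %b32 = zext i16 %b to i32
//   %r32 = add nuw nsw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16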
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

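// Example (an illustrative sketch): a uniform
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// becomes roughly
//   %x32 = zext i16 %x to i32
//   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
//   %s   = lshr i32 %r32, 16      ; 32 minus the original bit width
//   %r   = trunc i32 %s to i16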
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

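// Helper for visitFDiv: returns true when the plain fdiv should be kept
// rather than lowered to amdgcn.fdiv.fast. Roughly (a reading of the code,
// not authoritative): a non-constant numerator keeps the fdiv only when f32
// denormals are enabled; under unsafe math the fdiv is kept since it is later
// optimized to rcp+mul anyway; a numerator of +/-1.0 is left to the separate
// reciprocal lowering unless denormals are enabled.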
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
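//
// Example (illustrative, assuming f32 denormals are disabled and the fdiv
// carries !fpmath metadata allowing at least 2.5 ULP of error):
//   %d = fdiv float %x, %y, !fpmath !0
// becomes
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)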
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

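// Example (illustrative; the concrete address space number depends on the
// target's mapping of the constant address space): a uniform, 4-byte aligned
//   %v = load i8, i8 addrspace(C)* %p, align 4
// is widened to roughly
//   %q = bitcast i8 addrspace(C)* %p to i32 addrspace(C)*
//   %w = load i32, i32 addrspace(C)* %q
//   %v = trunc i32 %w to i8
// so that a scalar load can be selected instead of a vector load.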
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(
                ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}