//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for the given type \p T. For
  /// example, if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then
  /// <3 x i32> is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is signed, false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to an equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with the equivalent 32 bit binary
  /// operation, and truncating the result of the 32 bit binary operation back
  /// to \p I's original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to the 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

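// Note: in practice this means scalar i8 and i16 (and, when the subtarget
// lacks packed 16 bit instructions, short vectors of such element types) are
// promoted, while i1, i32, and wider types are left alone.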
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

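// Informal rationale for the two helpers below: the promoted operands carry at
// most 16 significant bits (they are zero- or sign-extended into i32), so, for
// example, an i16 add evaluated in i32 needs at most 17 bits and can wrap
// neither as a signed nor as an unsigned 32 bit value; the helpers record
// which wrap flags are therefore safe to put on the widened operation.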
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

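// Illustrative example of the rewrite performed below: a uniform
//   %r = add i16 %a, %b
// becomes
//   %ext0 = zext i16 %a to i32
//   %ext1 = zext i16 %b to i32
//   %res  = add nuw nsw i32 %ext0, %ext1
//   %r    = trunc i32 %res to i16
// (zext is used because 'add' is not one of the signed opcodes handled by
// isSigned).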
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

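// Illustrative example: a uniform 16 bit comparison such as
//   %c = icmp slt i16 %a, %b
// is rewritten into
//   %ext0 = sext i16 %a to i32
//   %ext1 = sext i16 %b to i32
//   %c    = icmp slt i32 %ext0, %ext1
// The i1 result type is unchanged, so no truncation is needed.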
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

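// Illustrative example: a uniform
//   %r = select i1 %c, i16 %a, i16 %b
// whose condition does not come from a signed icmp is rewritten into
//   %ext1 = zext i16 %a to i32
//   %ext2 = zext i16 %b to i32
//   %res  = select i1 %c, i32 %ext1, i32 %ext2
//   %r    = trunc i32 %res to i16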
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

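// Illustrative example: a uniform
//   %r = call i16 @llvm.bitreverse.i16(i16 %a)
// is rewritten into
//   %ext = zext i16 %a to i32
//   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %shr = lshr i32 %rev, 16        ; 32 minus the i16 bit width
//   %r   = trunc i32 %shr to i16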
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

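// For example, a numerator that is exactly +1.0 keeps the plain fdiv here so
// that the reciprocal case can be handled separately, as the comment below
// notes.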
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert the fast fdiv intrinsic in situations where reduced precision is
// permitted by the !fpmath metadata, even under otherwise safe math. Leave
// plain fdiv in situations where the generic node is expected to be optimized.
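// Illustrative example: given
//   %d = fdiv float %x, %y, !fpmath !0       ; !0 = !{float 2.500000e+00}
// on a subtarget without FP32 denormals and without unsafe FP math, the fdiv
// is replaced by
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)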
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

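// Intrinsic calls are dispatched on the intrinsic ID; only bitreverse
// currently has a 16-to-32 bit promotion, every other intrinsic is left
// untouched.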
bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

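  // Cache the next iterator before visiting, since a visitor may erase the
  // current instruction from its parent block.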
  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}