//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//==-----------------------------------------------------------------------===//

#define DEBUG_TYPE "PeepholeOpt"
#ifdef DEBUG
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#else
#define DEBUGME 0
#endif

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
    "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The peephole optimization pass is used to do simple last-minute
// optimizations that are required for correct code or that remove
// redundant functions.
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, we mark all pointers as living in the arena. This is
  // a workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because we don't want to invalidate any iterators while inside
  // safeNestedForEach, atomic conversions are pushed to a vector and
  // handled later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware we do not have 24-bit signed operations, so in
  // this case we need to expand them. These functions check for 24-bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards the division is slightly less accurate than on
  // previous generations, so we need to use a more accurate division there.
  // On all other cards the accurate divide can be translated to a normal
  // divide.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel-defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
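// If the callback F returns true, it is assumed to have advanced the inner
// iterator itself (for example because it erased the current instruction);
// otherwise the loop advances the iterator.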
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F) {
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm) {
  return new AMDGPUPeepholeOpt(tm);
}
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm) {
  mDebug = DEBUGME;
  optLevel = TM.getOptLevel();
}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
}

const char *
AMDGPUPeepholeOpt::getPassName() const {
  return "AMDGPU PeepHole Optimization Pass";
}

bool
containsPointerType(Type *Ty) {
  if (!Ty) {
    return false;
  }
  switch(Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
      atomicFuncs[x].first->getNumOperands()-1,
      atomicFuncs[x].second);
  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If optimizations are disabled, this cannot be properly evaluated yet,
    // so we add the call instruction to a vector and process the whole
    // vector at the end, after the samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                                : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
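  // Atomic builtins whose result is never used can be redirected to their
  // "_noret" variants (exchange is excluded because its result is the whole
  // point); they are recorded here and rewritten later in
  // doAtomicConversionIfNeeded().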
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift) {
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the
    // patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
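  // For example (roughly), (A & 0xFF00) | (D & 0x00FF) writes an 8-bit field
  // taken from bits 8-15 of A over the low bits of D, so it can be collapsed
  // into a single __amdil_ubit_insert call with width = 8 and offset = 8.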
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op: "; inst->dump();
    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
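  // For example, a mask of 0x00FF0000 yields a width of 8 (its population
  // count) and an offset of 16 (its number of trailing zeros); with no mask,
  // the shift amount alone determines the offset.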
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    return false;
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on shift-right/and patterns. The
  // basic optimization is to turn (A >> B) & C, where A is a 32bit type, B is
  // a value smaller than 32 and C is a mask. If C is a constant value, then
  // the following transformation can occur. For signed integers, it turns
  // into the function call dst = __amdil_ibit_extract(log2(C), B, A). For
  // unsigned integers, it turns into the function call
  // dst = __amdil_ubit_extract(log2(C), B, A). The function
  // __amdil_[u|i]bit_extract can be found in Section 7.9 of the ATI IL spec
  // of the stream SDK for Evergreen hardware.
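  // For example, (A >> 8) & 0xFF extracts an 8-bit field starting at bit 8 of
  // A, so (assuming the constants pass the checks below) it can be replaced
  // by a single bit-extract call with an offset of 8 and a width of 8.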
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32bit integers
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If we are a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant
  // integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shiftval is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left,
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
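    // For example, 0x000000FF and 0x00000001 are accepted here, while a
    // shifted mask such as 0x0000FF00 is rejected.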
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left, then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDGPU.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
                                   APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
                                   APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
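  // For example, __amdil_bfi(0x0000FF00, x, y) keeps bits 8-15 from x and
  // takes every other bit from y.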
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
                           "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
                               "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
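  // For example, with src0 = 4 and src1 = 8 this evaluates to
  // ((1 << 4) - 1) << 8 = 0x00000F00, i.e. a 4-bit mask starting at bit 8.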
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
                               lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
                               newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size <= alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
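  // Setting the alignment to 0 below lets the load/store fall back to the
  // natural (ABI) alignment for the type instead of the undersized value.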
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  // Note that "__amdil_imul24_high" also matches the "__amdil_imul24" prefix.
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}

void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24-bit operations, so we need to
  // expand them to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
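  // For example, a call to __amdil_imad24(a, b, c) is rewritten below as a
  // call to __amdil_imad_i32(a, b, c), and __amdil_imul24(a, b) simply
  // becomes a 32-bit mul instruction. The more specific __amdil_imul24_high
  // prefix has to be checked before the shorter __amdil_imul24 prefix.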
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  }
}

bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}

bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
}

void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

  unsigned samplerIdx = 1;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If the global has no initializer, or its initializer is not a 32-bit
  // integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
    return false;
  }

  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}

bool
AMDGPUPeepholeOpt::doInitialization(Module &M) {
  return false;
}

bool
AMDGPUPeepholeOpt::doFinalization(Module &M) {
  return false;
}

void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}

size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  };
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");
  return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}