|  | //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// | 
|  | // | 
|  | //                     The LLVM Compiler Infrastructure | 
|  | // | 
|  | // This file is distributed under the University of Illinois Open Source | 
|  | // License. See LICENSE.TXT for details. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  | // | 
|  | // This file contains an optimization for div and rem on architectures that | 
|  | // execute short instructions significantly faster than longer instructions. | 
|  | // For example, on Intel Atom 32-bit divides are slow enough that during | 
|  | // runtime it is profitable to check the value of the operands, and if they are | 
|  | // positive and less than 256 use an unsigned 8-bit divide. | 
|  | // | 
|  | //===----------------------------------------------------------------------===// | 
|  |  | 
|  | #include "llvm/Transforms/Utils/BypassSlowDivision.h" | 
|  | #include "llvm/ADT/DenseMap.h" | 
|  | #include "llvm/ADT/None.h" | 
|  | #include "llvm/ADT/Optional.h" | 
|  | #include "llvm/ADT/STLExtras.h" | 
|  | #include "llvm/ADT/SmallPtrSet.h" | 
|  | #include "llvm/Analysis/ValueTracking.h" | 
|  | #include "llvm/IR/BasicBlock.h" | 
|  | #include "llvm/IR/Constants.h" | 
|  | #include "llvm/IR/DerivedTypes.h" | 
|  | #include "llvm/IR/Function.h" | 
|  | #include "llvm/IR/IRBuilder.h" | 
|  | #include "llvm/IR/Instruction.h" | 
|  | #include "llvm/IR/Instructions.h" | 
|  | #include "llvm/IR/Module.h" | 
|  | #include "llvm/IR/Type.h" | 
|  | #include "llvm/IR/Value.h" | 
|  | #include "llvm/Support/Casting.h" | 
|  | #include "llvm/Support/KnownBits.h" | 
|  | #include "llvm/Transforms/Utils/Local.h" | 
|  | #include <cassert> | 
|  | #include <cstdint> | 
|  |  | 
|  | using namespace llvm; | 
|  |  | 
|  | #define DEBUG_TYPE "bypass-slow-division" | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | struct QuotRemPair { | 
|  | Value *Quotient; | 
|  | Value *Remainder; | 
|  |  | 
|  | QuotRemPair(Value *InQuotient, Value *InRemainder) | 
|  | : Quotient(InQuotient), Remainder(InRemainder) {} | 
|  | }; | 
|  |  | 
|  | /// A quotient and remainder, plus a BB from which they logically "originate". | 
|  | /// If you use Quotient or Remainder in a Phi node, you should use BB as its | 
|  | /// corresponding predecessor. | 
|  | struct QuotRemWithBB { | 
|  | BasicBlock *BB = nullptr; | 
|  | Value *Quotient = nullptr; | 
|  | Value *Remainder = nullptr; | 
|  | }; | 
|  |  | 
|  | using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; | 
|  | using BypassWidthsTy = DenseMap<unsigned, unsigned>; | 
|  | using VisitedSetTy = SmallPtrSet<Instruction *, 4>; | 
|  |  | 
|  | enum ValueRange { | 
|  | /// Operand definitely fits into BypassType. No runtime checks are needed. | 
|  | VALRNG_KNOWN_SHORT, | 
|  | /// A runtime check is required, as value range is unknown. | 
|  | VALRNG_UNKNOWN, | 
|  | /// Operand is unlikely to fit into BypassType. The bypassing should be | 
|  | /// disabled. | 
|  | VALRNG_LIKELY_LONG | 
|  | }; | 
|  |  | 
|  | class FastDivInsertionTask { | 
|  | bool IsValidTask = false; | 
|  | Instruction *SlowDivOrRem = nullptr; | 
|  | IntegerType *BypassType = nullptr; | 
|  | BasicBlock *MainBB = nullptr; | 
|  |  | 
|  | bool isHashLikeValue(Value *V, VisitedSetTy &Visited); | 
|  | ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); | 
|  | QuotRemWithBB createSlowBB(BasicBlock *Successor); | 
|  | QuotRemWithBB createFastBB(BasicBlock *Successor); | 
|  | QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, | 
|  | BasicBlock *PhiBB); | 
|  | Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); | 
|  | Optional<QuotRemPair> insertFastDivAndRem(); | 
|  |  | 
|  | bool isSignedOp() { | 
|  | return SlowDivOrRem->getOpcode() == Instruction::SDiv || | 
|  | SlowDivOrRem->getOpcode() == Instruction::SRem; | 
|  | } | 
|  |  | 
|  | bool isDivisionOp() { | 
|  | return SlowDivOrRem->getOpcode() == Instruction::SDiv || | 
|  | SlowDivOrRem->getOpcode() == Instruction::UDiv; | 
|  | } | 
|  |  | 
|  | Type *getSlowType() { return SlowDivOrRem->getType(); } | 
|  |  | 
|  | public: | 
|  | FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); | 
|  |  | 
|  | Value *getReplacement(DivCacheTy &Cache); | 
|  | }; | 
|  |  | 
|  | } // end anonymous namespace | 
|  |  | 
|  | FastDivInsertionTask::FastDivInsertionTask(Instruction *I, | 
|  | const BypassWidthsTy &BypassWidths) { | 
|  | switch (I->getOpcode()) { | 
|  | case Instruction::UDiv: | 
|  | case Instruction::SDiv: | 
|  | case Instruction::URem: | 
|  | case Instruction::SRem: | 
|  | SlowDivOrRem = I; | 
|  | break; | 
|  | default: | 
|  | // I is not a div/rem operation. | 
|  | return; | 
|  | } | 
|  |  | 
|  | // Skip division on vector types. Only optimize integer instructions. | 
|  | IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); | 
|  | if (!SlowType) | 
|  | return; | 
|  |  | 
|  | // Skip if this bitwidth is not bypassed. | 
|  | auto BI = BypassWidths.find(SlowType->getBitWidth()); | 
|  | if (BI == BypassWidths.end()) | 
|  | return; | 
|  |  | 
|  | // Get type for div/rem instruction with bypass bitwidth. | 
|  | IntegerType *BT = IntegerType::get(I->getContext(), BI->second); | 
|  | BypassType = BT; | 
|  |  | 
|  | // The original basic block. | 
|  | MainBB = I->getParent(); | 
|  |  | 
|  | // The instruction is indeed a slow div or rem operation. | 
|  | IsValidTask = true; | 
|  | } | 
|  |  | 
|  | /// Reuses previously-computed dividend or remainder from the current BB if | 
|  | /// operands and operation are identical. Otherwise calls insertFastDivAndRem to | 
|  | /// perform the optimization and caches the resulting dividend and remainder. | 
|  | /// If no replacement can be generated, nullptr is returned. | 
|  | Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { | 
|  | // First, make sure that the task is valid. | 
|  | if (!IsValidTask) | 
|  | return nullptr; | 
|  |  | 
|  | // Then, look for a value in Cache. | 
|  | Value *Dividend = SlowDivOrRem->getOperand(0); | 
|  | Value *Divisor = SlowDivOrRem->getOperand(1); | 
|  | DivRemMapKey Key(isSignedOp(), Dividend, Divisor); | 
|  | auto CacheI = Cache.find(Key); | 
|  |  | 
|  | if (CacheI == Cache.end()) { | 
|  | // If previous instance does not exist, try to insert fast div. | 
|  | Optional<QuotRemPair> OptResult = insertFastDivAndRem(); | 
|  | // Bail out if insertFastDivAndRem has failed. | 
|  | if (!OptResult) | 
|  | return nullptr; | 
|  | CacheI = Cache.insert({Key, *OptResult}).first; | 
|  | } | 
|  |  | 
|  | QuotRemPair &Value = CacheI->second; | 
|  | return isDivisionOp() ? Value.Quotient : Value.Remainder; | 
|  | } | 
|  |  | 
|  | /// \brief Check if a value looks like a hash. | 
|  | /// | 
|  | /// The routine is expected to detect values computed using the most common hash | 
|  | /// algorithms. Typically, hash computations end with one of the following | 
|  | /// instructions: | 
|  | /// | 
|  | /// 1) MUL with a constant wider than BypassType | 
|  | /// 2) XOR instruction | 
|  | /// | 
|  | /// And even if we are wrong and the value is not a hash, it is still quite | 
|  | /// unlikely that such values will fit into BypassType. | 
|  | /// | 
|  | /// To detect string hash algorithms like FNV we have to look through PHI-nodes. | 
|  | /// It is implemented as a depth-first search for values that look neither long | 
|  | /// nor hash-like. | 
|  | bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { | 
|  | Instruction *I = dyn_cast<Instruction>(V); | 
|  | if (!I) | 
|  | return false; | 
|  |  | 
|  | switch (I->getOpcode()) { | 
|  | case Instruction::Xor: | 
|  | return true; | 
|  | case Instruction::Mul: { | 
|  | // After Constant Hoisting pass, long constants may be represented as | 
|  | // bitcast instructions. As a result, some constants may look like an | 
|  | // instruction at first, and an additional check is necessary to find out if | 
|  | // an operand is actually a constant. | 
|  | Value *Op1 = I->getOperand(1); | 
|  | ConstantInt *C = dyn_cast<ConstantInt>(Op1); | 
|  | if (!C && isa<BitCastInst>(Op1)) | 
|  | C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); | 
|  | return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); | 
|  | } | 
|  | case Instruction::PHI: | 
|  | // Stop IR traversal in case of a crazy input code. This limits recursion | 
|  | // depth. | 
|  | if (Visited.size() >= 16) | 
|  | return false; | 
|  | // Do not visit nodes that have been visited already. We return true because | 
|  | // it means that we couldn't find any value that doesn't look hash-like. | 
|  | if (Visited.find(I) != Visited.end()) | 
|  | return true; | 
|  | Visited.insert(I); | 
|  | return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { | 
|  | // Ignore undef values as they probably don't affect the division | 
|  | // operands. | 
|  | return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || | 
|  | isa<UndefValue>(V); | 
|  | }); | 
|  | default: | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | /// Check if an integer value fits into our bypass type. | 
|  | ValueRange FastDivInsertionTask::getValueRange(Value *V, | 
|  | VisitedSetTy &Visited) { | 
|  | unsigned ShortLen = BypassType->getBitWidth(); | 
|  | unsigned LongLen = V->getType()->getIntegerBitWidth(); | 
|  |  | 
|  | assert(LongLen > ShortLen && "Value type must be wider than BypassType"); | 
|  | unsigned HiBits = LongLen - ShortLen; | 
|  |  | 
|  | const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); | 
|  | KnownBits Known(LongLen); | 
|  |  | 
|  | computeKnownBits(V, Known, DL); | 
|  |  | 
|  | if (Known.countMinLeadingZeros() >= HiBits) | 
|  | return VALRNG_KNOWN_SHORT; | 
|  |  | 
|  | if (Known.countMaxLeadingZeros() < HiBits) | 
|  | return VALRNG_LIKELY_LONG; | 
|  |  | 
|  | // Long integer divisions are often used in hashtable implementations. It's | 
|  | // not worth bypassing such divisions because hash values are extremely | 
|  | // unlikely to have enough leading zeros. The call below tries to detect | 
|  | // values that are unlikely to fit BypassType (including hashes). | 
|  | if (isHashLikeValue(V, Visited)) | 
|  | return VALRNG_LIKELY_LONG; | 
|  |  | 
|  | return VALRNG_UNKNOWN; | 
|  | } | 
|  |  | 
|  | /// Add new basic block for slow div and rem operations and put it before | 
|  | /// SuccessorBB. | 
|  | QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { | 
|  | QuotRemWithBB DivRemPair; | 
|  | DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", | 
|  | MainBB->getParent(), SuccessorBB); | 
|  | IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); | 
|  |  | 
|  | Value *Dividend = SlowDivOrRem->getOperand(0); | 
|  | Value *Divisor = SlowDivOrRem->getOperand(1); | 
|  |  | 
|  | if (isSignedOp()) { | 
|  | DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); | 
|  | DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); | 
|  | } else { | 
|  | DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); | 
|  | DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); | 
|  | } | 
|  |  | 
|  | Builder.CreateBr(SuccessorBB); | 
|  | return DivRemPair; | 
|  | } | 
|  |  | 
|  | /// Add new basic block for fast div and rem operations and put it before | 
|  | /// SuccessorBB. | 
|  | QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { | 
|  | QuotRemWithBB DivRemPair; | 
|  | DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", | 
|  | MainBB->getParent(), SuccessorBB); | 
|  | IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); | 
|  |  | 
|  | Value *Dividend = SlowDivOrRem->getOperand(0); | 
|  | Value *Divisor = SlowDivOrRem->getOperand(1); | 
|  | Value *ShortDivisorV = | 
|  | Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); | 
|  | Value *ShortDividendV = | 
|  | Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); | 
|  |  | 
|  | // udiv/urem because this optimization only handles positive numbers. | 
|  | Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); | 
|  | Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); | 
|  | DivRemPair.Quotient = | 
|  | Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); | 
|  | DivRemPair.Remainder = | 
|  | Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); | 
|  | Builder.CreateBr(SuccessorBB); | 
|  |  | 
|  | return DivRemPair; | 
|  | } | 
|  |  | 
|  | /// Creates Phi nodes for result of Div and Rem. | 
|  | QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, | 
|  | QuotRemWithBB &RHS, | 
|  | BasicBlock *PhiBB) { | 
|  | IRBuilder<> Builder(PhiBB, PhiBB->begin()); | 
|  | PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); | 
|  | QuoPhi->addIncoming(LHS.Quotient, LHS.BB); | 
|  | QuoPhi->addIncoming(RHS.Quotient, RHS.BB); | 
|  | PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); | 
|  | RemPhi->addIncoming(LHS.Remainder, LHS.BB); | 
|  | RemPhi->addIncoming(RHS.Remainder, RHS.BB); | 
|  | return QuotRemPair(QuoPhi, RemPhi); | 
|  | } | 
|  |  | 
|  | /// Creates a runtime check to test whether both the divisor and dividend fit | 
|  | /// into BypassType. The check is inserted at the end of MainBB. True return | 
|  | /// value means that the operands fit. Either of the operands may be NULL if it | 
|  | /// doesn't need a runtime check. | 
|  | Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { | 
|  | assert((Op1 || Op2) && "Nothing to check"); | 
|  | IRBuilder<> Builder(MainBB, MainBB->end()); | 
|  |  | 
|  | Value *OrV; | 
|  | if (Op1 && Op2) | 
|  | OrV = Builder.CreateOr(Op1, Op2); | 
|  | else | 
|  | OrV = Op1 ? Op1 : Op2; | 
|  |  | 
|  | // BitMask is inverted to check if the operands are | 
|  | // larger than the bypass type | 
|  | uint64_t BitMask = ~BypassType->getBitMask(); | 
|  | Value *AndV = Builder.CreateAnd(OrV, BitMask); | 
|  |  | 
|  | // Compare operand values | 
|  | Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); | 
|  | return Builder.CreateICmpEQ(AndV, ZeroV); | 
|  | } | 
|  |  | 
|  | /// Substitutes the div/rem instruction with code that checks the value of the | 
|  | /// operands and uses a shorter-faster div/rem instruction when possible. | 
|  | Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { | 
|  | Value *Dividend = SlowDivOrRem->getOperand(0); | 
|  | Value *Divisor = SlowDivOrRem->getOperand(1); | 
|  |  | 
|  | VisitedSetTy SetL; | 
|  | ValueRange DividendRange = getValueRange(Dividend, SetL); | 
|  | if (DividendRange == VALRNG_LIKELY_LONG) | 
|  | return None; | 
|  |  | 
|  | VisitedSetTy SetR; | 
|  | ValueRange DivisorRange = getValueRange(Divisor, SetR); | 
|  | if (DivisorRange == VALRNG_LIKELY_LONG) | 
|  | return None; | 
|  |  | 
|  | bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); | 
|  | bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); | 
|  |  | 
|  | if (DividendShort && DivisorShort) { | 
|  | // If both operands are known to be short then just replace the long | 
|  | // division with a short one in-place.  Since we're not introducing control | 
|  | // flow in this case, narrowing the division is always a win, even if the | 
|  | // divisor is a constant (and will later get replaced by a multiplication). | 
|  |  | 
|  | IRBuilder<> Builder(SlowDivOrRem); | 
|  | Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); | 
|  | Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); | 
|  | Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); | 
|  | Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); | 
|  | Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); | 
|  | Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); | 
|  | return QuotRemPair(ExtDiv, ExtRem); | 
|  | } | 
|  |  | 
|  | if (isa<ConstantInt>(Divisor)) { | 
|  | // If the divisor is not a constant, DAGCombiner will convert it to a | 
|  | // multiplication by a magic constant.  It isn't clear if it is worth | 
|  | // introducing control flow to get a narrower multiply. | 
|  | return None; | 
|  | } | 
|  |  | 
|  | if (DividendShort && !isSignedOp()) { | 
|  | // If the division is unsigned and Dividend is known to be short, then | 
|  | // either | 
|  | // 1) Divisor is less or equal to Dividend, and the result can be computed | 
|  | //    with a short division. | 
|  | // 2) Divisor is greater than Dividend. In this case, no division is needed | 
|  | //    at all: The quotient is 0 and the remainder is equal to Dividend. | 
|  | // | 
|  | // So instead of checking at runtime whether Divisor fits into BypassType, | 
|  | // we emit a runtime check to differentiate between these two cases. This | 
|  | // lets us entirely avoid a long div. | 
|  |  | 
|  | // Split the basic block before the div/rem. | 
|  | BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); | 
|  | // Remove the unconditional branch from MainBB to SuccessorBB. | 
|  | MainBB->getInstList().back().eraseFromParent(); | 
|  | QuotRemWithBB Long; | 
|  | Long.BB = MainBB; | 
|  | Long.Quotient = ConstantInt::get(getSlowType(), 0); | 
|  | Long.Remainder = Dividend; | 
|  | QuotRemWithBB Fast = createFastBB(SuccessorBB); | 
|  | QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); | 
|  | IRBuilder<> Builder(MainBB, MainBB->end()); | 
|  | Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); | 
|  | Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); | 
|  | return Result; | 
|  | } else { | 
|  | // General case. Create both slow and fast div/rem pairs and choose one of | 
|  | // them at runtime. | 
|  |  | 
|  | // Split the basic block before the div/rem. | 
|  | BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); | 
|  | // Remove the unconditional branch from MainBB to SuccessorBB. | 
|  | MainBB->getInstList().back().eraseFromParent(); | 
|  | QuotRemWithBB Fast = createFastBB(SuccessorBB); | 
|  | QuotRemWithBB Slow = createSlowBB(SuccessorBB); | 
|  | QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); | 
|  | Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, | 
|  | DivisorShort ? nullptr : Divisor); | 
|  | IRBuilder<> Builder(MainBB, MainBB->end()); | 
|  | Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); | 
|  | return Result; | 
|  | } | 
|  | } | 
|  |  | 
|  | /// This optimization identifies DIV/REM instructions in a BB that can be | 
|  | /// profitably bypassed and carried out with a shorter, faster divide. | 
|  | bool llvm::bypassSlowDivision(BasicBlock *BB, | 
|  | const BypassWidthsTy &BypassWidths) { | 
|  | DivCacheTy PerBBDivCache; | 
|  |  | 
|  | bool MadeChange = false; | 
|  | Instruction* Next = &*BB->begin(); | 
|  | while (Next != nullptr) { | 
|  | // We may add instructions immediately after I, but we want to skip over | 
|  | // them. | 
|  | Instruction* I = Next; | 
|  | Next = Next->getNextNode(); | 
|  |  | 
|  | FastDivInsertionTask Task(I, BypassWidths); | 
|  | if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { | 
|  | I->replaceAllUsesWith(Replacement); | 
|  | I->eraseFromParent(); | 
|  | MadeChange = true; | 
|  | } | 
|  | } | 
|  |  | 
|  | // Above we eagerly create divs and rems, as pairs, so that we can efficiently | 
|  | // create divrem machine instructions.  Now erase any unused divs / rems so we | 
|  | // don't leave extra instructions sitting around. | 
|  | for (auto &KV : PerBBDivCache) | 
|  | for (Value *V : {KV.second.Quotient, KV.second.Remainder}) | 
|  | RecursivelyDeleteTriviallyDeadInstructions(V); | 
|  |  | 
|  | return MadeChange; | 
|  | } |