Nadav Rotem | 2d9dec3 | 2013-04-09 19:44:35 +0000 | [diff] [blame] | 1 | //===- VecUtils.h --- Vectorization Utilities -----------------------------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | #define DEBUG_TYPE "VecUtils" |
| 10 | |
| 11 | #include "VecUtils.h" |
| 12 | #include "llvm/ADT/DenseMap.h" |
| 13 | #include "llvm/ADT/SmallPtrSet.h" |
| 14 | #include "llvm/ADT/SmallSet.h" |
| 15 | #include "llvm/ADT/SmallVector.h" |
| 16 | #include "llvm/Analysis/AliasAnalysis.h" |
| 17 | #include "llvm/Analysis/ScalarEvolution.h" |
| 18 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 19 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 20 | #include "llvm/Analysis/Verifier.h" |
| 21 | #include "llvm/IR/Constants.h" |
| 22 | #include "llvm/IR/DataLayout.h" |
| 23 | #include "llvm/IR/Function.h" |
| 24 | #include "llvm/IR/Instructions.h" |
| 25 | #include "llvm/IR/Module.h" |
| 26 | #include "llvm/IR/Type.h" |
| 27 | #include "llvm/IR/Value.h" |
| 28 | #include "llvm/Pass.h" |
| 29 | #include "llvm/Support/CommandLine.h" |
| 30 | #include "llvm/Support/Debug.h" |
| 31 | #include "llvm/Support/raw_ostream.h" |
| 32 | #include "llvm/Target/TargetLibraryInfo.h" |
| 33 | #include "llvm/Transforms/Scalar.h" |
| 34 | #include "llvm/Transforms/Utils/Local.h" |
| 35 | #include <algorithm> |
| 36 | #include <map> |
| 37 | |
| 38 | using namespace llvm; |
| 39 | |
| 40 | namespace llvm { |
| 41 | |
| 42 | BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, |
| 43 | TargetTransformInfo *Tti, AliasAnalysis *Aa) : |
| 44 | BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa) { |
| 45 | numberInstructions(); |
| 46 | } |
| 47 | |
| 48 | void BoUpSLP::numberInstructions() { |
| 49 | int Loc = 0; |
| 50 | InstrIdx.clear(); |
| 51 | InstrVec.clear(); |
| 52 | // Number the instructions in the block. |
| 53 | for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { |
| 54 | InstrIdx[it] = Loc++; |
| 55 | InstrVec.push_back(it); |
| 56 | assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); |
| 57 | } |
| 58 | } |
| 59 | |
| 60 | Value *BoUpSLP::getPointerOperand(Value *I) { |
| 61 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); |
| 62 | if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); |
| 63 | return 0; |
| 64 | } |
| 65 | |
| 66 | unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { |
| 67 | if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); |
| 68 | if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace(); |
| 69 | return -1; |
| 70 | } |
| 71 | |
| 72 | bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { |
| 73 | Value *PtrA = getPointerOperand(A); |
| 74 | Value *PtrB = getPointerOperand(B); |
| 75 | unsigned ASA = getAddressSpaceOperand(A); |
| 76 | unsigned ASB = getAddressSpaceOperand(B); |
| 77 | |
| 78 | // Check that the address spaces match and that the pointers are valid. |
| 79 | if (!PtrA || !PtrB || (ASA != ASB)) return false; |
| 80 | |
| 81 | // Check that A and B are of the same type. |
| 82 | if (PtrA->getType() != PtrB->getType()) return false; |
| 83 | |
| 84 | // Calculate the distance. |
| 85 | const SCEV *PtrSCEVA = SE->getSCEV(PtrA); |
| 86 | const SCEV *PtrSCEVB = SE->getSCEV(PtrB); |
| 87 | const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); |
| 88 | const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV); |
| 89 | |
| 90 | // Non constant distance. |
| 91 | if (!ConstOffSCEV) return false; |
| 92 | |
| 93 | unsigned Offset = ConstOffSCEV->getValue()->getSExtValue(); |
| 94 | Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); |
| 95 | // The Instructions are connsecutive if the size of the first load/store is |
| 96 | // the same as the offset. |
Nadav Rotem | 88dd5f7 | 2013-04-10 18:57:27 +0000 | [diff] [blame] | 97 | unsigned Sz = DL->getTypeStoreSize(Ty); |
Nadav Rotem | 2d9dec3 | 2013-04-09 19:44:35 +0000 | [diff] [blame] | 98 | return ((-Offset) == Sz); |
| 99 | } |
| 100 | |
| 101 | bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) { |
| 102 | ValueSet Heads, Tails; |
| 103 | SmallDenseMap<Value*, Value*> ConsecutiveChain; |
| 104 | bool Changed = false; |
| 105 | |
| 106 | // Do a quadratic search on all of the given stores and find |
| 107 | // all of the pairs of loads that follow each other. |
| 108 | for (unsigned i = 0, e = Stores.size(); i < e; ++i) |
| 109 | for (unsigned j = 0; j < e; ++j) { |
| 110 | if (i == j) continue; |
| 111 | if (isConsecutiveAccess(Stores[i], Stores[j])) { |
| 112 | Tails.insert(Stores[j]); |
| 113 | Heads.insert(Stores[i]); |
| 114 | ConsecutiveChain[Stores[i]] = Stores[j]; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | // For stores that start but don't end a link in the chain: |
| 119 | for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) { |
| 120 | if (Tails.count(*it)) continue; |
| 121 | |
| 122 | // We found a store instr that starts a chain. Now follow the chain and try |
| 123 | // to vectorize it. |
| 124 | ValueList Operands; |
| 125 | Value *I = *it; |
| 126 | int MinCost = 0, MinVF = 0; |
| 127 | while (Tails.count(I) || Heads.count(I)) { |
| 128 | Operands.push_back(I); |
| 129 | unsigned VF = Operands.size(); |
| 130 | if (isPowerOf2_32(VF) && VF > 1) { |
| 131 | int cost = getTreeRollCost(Operands, 0); |
| 132 | DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n"); |
| 133 | if (cost < MinCost) { MinCost = cost; MinVF = VF; } |
| 134 | } |
| 135 | // Move to the next value in the chain. |
| 136 | I = ConsecutiveChain[I]; |
| 137 | } |
| 138 | |
| 139 | if (MinCost <= costThreshold && MinVF > 1) { |
| 140 | DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n"); |
| 141 | vectorizeTree(Operands, MinVF); |
| 142 | Stores.clear(); |
| 143 | // The current numbering is invalid because we added and removed instrs. |
| 144 | numberInstructions(); |
| 145 | Changed = true; |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | return Changed; |
| 150 | } |
| 151 | |
| 152 | int BoUpSLP::getScalarizationCost(Type *Ty) { |
| 153 | int Cost = 0; |
| 154 | for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) |
| 155 | Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); |
| 156 | return Cost; |
| 157 | } |
| 158 | |
| 159 | AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { |
| 160 | if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); |
| 161 | if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI); |
| 162 | return AliasAnalysis::Location(); |
| 163 | } |
| 164 | |
| 165 | Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { |
| 166 | assert(Src->getParent() == Dst->getParent() && "Not the same BB"); |
| 167 | BasicBlock::iterator I = Src, E = Dst; |
| 168 | /// Scan all of the instruction from SRC to DST and check if |
| 169 | /// the source may alias. |
| 170 | for (++I; I != E; ++I) { |
| 171 | // Ignore store instructions that are marked as 'ignore'. |
| 172 | if (MemBarrierIgnoreList.count(I)) continue; |
| 173 | if (Src->mayWriteToMemory()) /* Write */ { |
| 174 | if (!I->mayReadOrWriteMemory()) continue; |
| 175 | } else /* Read */ { |
| 176 | if (!I->mayWriteToMemory()) continue; |
| 177 | } |
| 178 | AliasAnalysis::Location A = getLocation(&*I); |
| 179 | AliasAnalysis::Location B = getLocation(Src); |
| 180 | |
| 181 | if (!A.Ptr || !B.Ptr || AA->alias(A, B)) |
| 182 | return I; |
| 183 | } |
| 184 | return 0; |
| 185 | } |
| 186 | |
| 187 | int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) { |
| 188 | if (Depth == 6) return max_cost; |
| 189 | Type *ScalarTy = VL[0]->getType(); |
| 190 | |
| 191 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) |
| 192 | ScalarTy = SI->getValueOperand()->getType(); |
| 193 | |
| 194 | /// Don't mess with vectors. |
| 195 | if (ScalarTy->isVectorTy()) return max_cost; |
| 196 | |
| 197 | VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); |
| 198 | |
| 199 | // Check if all of the operands are constants. |
| 200 | bool AllConst = true; |
| 201 | bool AllSameScalar = true; |
| 202 | for (unsigned i = 0, e = VL.size(); i < e; ++i) { |
| 203 | AllConst &= isa<Constant>(VL[i]); |
| 204 | AllSameScalar &= (VL[0] == VL[i]); |
| 205 | // Must have a single use. |
| 206 | Instruction *I = dyn_cast<Instruction>(VL[i]); |
| 207 | // Need to scalarize instructions with multiple users or from other BBs. |
| 208 | if (I && ((I->getNumUses() > 1) || (I->getParent() != BB))) |
| 209 | return getScalarizationCost(VecTy); |
| 210 | } |
| 211 | |
| 212 | // Is this a simple vector constant. |
| 213 | if (AllConst) return 0; |
| 214 | |
| 215 | // If all of the operands are identical we can broadcast them. |
| 216 | if (AllSameScalar) |
| 217 | return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); |
| 218 | |
| 219 | // Scalarize unknown structures. |
| 220 | Instruction *VL0 = dyn_cast<Instruction>(VL[0]); |
| 221 | if (!VL0) return getScalarizationCost(VecTy); |
| 222 | assert(VL0->getParent() == BB && "Wrong BB"); |
| 223 | |
| 224 | unsigned Opcode = VL0->getOpcode(); |
| 225 | for (unsigned i = 0, e = VL.size(); i < e; ++i) { |
| 226 | Instruction *I = dyn_cast<Instruction>(VL[i]); |
| 227 | // If not all of the instructions are identical then we have to scalarize. |
| 228 | if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy); |
| 229 | } |
| 230 | |
| 231 | // Check if it is safe to sink the loads or the stores. |
| 232 | if (Opcode == Instruction::Load || Opcode == Instruction::Store) { |
| 233 | int MaxIdx = InstrIdx[VL0]; |
| 234 | for (unsigned i = 1, e = VL.size(); i < e; ++i ) |
| 235 | MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); |
| 236 | |
| 237 | Instruction *Last = InstrVec[MaxIdx]; |
| 238 | for (unsigned i = 0, e = VL.size(); i < e; ++i ) { |
| 239 | if (VL[i] == Last) continue; |
| 240 | Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last); |
| 241 | if (Barrier) { |
| 242 | DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " << |
| 243 | *Last << "\n because of " << *Barrier << "\n"); |
| 244 | return max_cost; |
| 245 | } |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | switch (Opcode) { |
| 250 | case Instruction::Add: |
| 251 | case Instruction::FAdd: |
| 252 | case Instruction::Sub: |
| 253 | case Instruction::FSub: |
| 254 | case Instruction::Mul: |
| 255 | case Instruction::FMul: |
| 256 | case Instruction::UDiv: |
| 257 | case Instruction::SDiv: |
| 258 | case Instruction::FDiv: |
| 259 | case Instruction::URem: |
| 260 | case Instruction::SRem: |
| 261 | case Instruction::FRem: |
| 262 | case Instruction::Shl: |
| 263 | case Instruction::LShr: |
| 264 | case Instruction::AShr: |
| 265 | case Instruction::And: |
| 266 | case Instruction::Or: |
| 267 | case Instruction::Xor: { |
| 268 | ValueList Operands; |
| 269 | int Cost = 0; |
| 270 | // Calculate the cost of all of the operands. |
| 271 | for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { |
| 272 | // Prepare the operand vector. |
| 273 | for (unsigned j = 0; j < VL.size(); ++j) |
| 274 | Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); |
| 275 | Cost += getTreeRollCost(Operands, Depth+1); |
| 276 | Operands.clear(); |
| 277 | } |
| 278 | |
| 279 | // Calculate the cost of this instruction. |
| 280 | int ScalarCost = VecTy->getNumElements() * |
| 281 | TTI->getArithmeticInstrCost(Opcode, ScalarTy); |
| 282 | int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); |
| 283 | Cost += (VecCost - ScalarCost); |
| 284 | return Cost; |
| 285 | } |
| 286 | case Instruction::Load: { |
| 287 | // If we are scalarize the loads, add the cost of forming the vector. |
| 288 | for (unsigned i = 0, e = VL.size()-1; i < e; ++i) |
| 289 | if (!isConsecutiveAccess(VL[i], VL[i+1])) |
| 290 | return getScalarizationCost(VecTy); |
| 291 | |
| 292 | // Cost of wide load - cost of scalar loads. |
| 293 | int ScalarLdCost = VecTy->getNumElements() * |
| 294 | TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); |
| 295 | int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); |
| 296 | return VecLdCost - ScalarLdCost; |
| 297 | } |
| 298 | case Instruction::Store: { |
| 299 | // We know that we can merge the stores. Calculate the cost. |
| 300 | int ScalarStCost = VecTy->getNumElements() * |
| 301 | TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); |
| 302 | int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0); |
| 303 | int StoreCost = VecStCost - ScalarStCost; |
| 304 | |
| 305 | ValueList Operands; |
| 306 | for (unsigned j = 0; j < VL.size(); ++j) { |
| 307 | Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); |
| 308 | MemBarrierIgnoreList.insert(VL[j]); |
| 309 | } |
| 310 | |
| 311 | int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1); |
| 312 | MemBarrierIgnoreList.clear(); |
| 313 | return TotalCost; |
| 314 | } |
| 315 | default: |
| 316 | // Unable to vectorize unknown instructions. |
| 317 | return getScalarizationCost(VecTy); |
| 318 | } |
| 319 | } |
| 320 | |
| 321 | Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) { |
| 322 | int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; |
| 323 | for (unsigned i = 0; i < VF; ++i ) |
| 324 | MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); |
| 325 | return InstrVec[MaxIdx + 1]; |
| 326 | } |
| 327 | |
| 328 | Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) { |
| 329 | IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); |
| 330 | Value *Vec = UndefValue::get(Ty); |
| 331 | for (unsigned i=0; i < Ty->getNumElements(); ++i) |
| 332 | Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); |
| 333 | return Vec; |
| 334 | } |
| 335 | |
| 336 | Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) { |
| 337 | Type *ScalarTy = VL[0]->getType(); |
| 338 | if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) |
| 339 | ScalarTy = SI->getValueOperand()->getType(); |
| 340 | VectorType *VecTy = VectorType::get(ScalarTy, VF); |
| 341 | |
| 342 | // Check if all of the operands are constants or identical. |
| 343 | bool AllConst = true; |
| 344 | bool AllSameScalar = true; |
| 345 | for (unsigned i = 0, e = VF; i < e; ++i) { |
| 346 | AllConst &= !!dyn_cast<Constant>(VL[i]); |
| 347 | AllSameScalar &= (VL[0] == VL[i]); |
| 348 | // Must have a single use. |
| 349 | Instruction *I = dyn_cast<Instruction>(VL[i]); |
| 350 | if (I && (I->getNumUses() > 1 || I->getParent() != BB)) |
| 351 | return Scalarize(VL, VecTy); |
| 352 | } |
| 353 | |
| 354 | // Is this a simple vector constant. |
| 355 | if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); |
| 356 | |
| 357 | // Scalarize unknown structures. |
| 358 | Instruction *VL0 = dyn_cast<Instruction>(VL[0]); |
| 359 | if (!VL0) return Scalarize(VL, VecTy); |
| 360 | |
| 361 | unsigned Opcode = VL0->getOpcode(); |
| 362 | for (unsigned i = 0, e = VF; i < e; ++i) { |
| 363 | Instruction *I = dyn_cast<Instruction>(VL[i]); |
| 364 | // If not all of the instructions are identical then we have to scalarize. |
| 365 | if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); |
| 366 | } |
| 367 | |
| 368 | switch (Opcode) { |
| 369 | case Instruction::Add: |
| 370 | case Instruction::FAdd: |
| 371 | case Instruction::Sub: |
| 372 | case Instruction::FSub: |
| 373 | case Instruction::Mul: |
| 374 | case Instruction::FMul: |
| 375 | case Instruction::UDiv: |
| 376 | case Instruction::SDiv: |
| 377 | case Instruction::FDiv: |
| 378 | case Instruction::URem: |
| 379 | case Instruction::SRem: |
| 380 | case Instruction::FRem: |
| 381 | case Instruction::Shl: |
| 382 | case Instruction::LShr: |
| 383 | case Instruction::AShr: |
| 384 | case Instruction::And: |
| 385 | case Instruction::Or: |
| 386 | case Instruction::Xor: { |
| 387 | ValueList LHSVL, RHSVL; |
| 388 | for (int i = 0; i < VF; ++i) { |
| 389 | RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); |
| 390 | LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); |
| 391 | } |
| 392 | |
| 393 | Value *RHS = vectorizeTree(RHSVL, VF); |
| 394 | Value *LHS = vectorizeTree(LHSVL, VF); |
| 395 | IRBuilder<> Builder(GetLastInstr(VL, VF)); |
| 396 | BinaryOperator *BinOp = dyn_cast<BinaryOperator>(VL0); |
| 397 | return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); |
| 398 | } |
| 399 | case Instruction::Load: { |
| 400 | LoadInst *LI = dyn_cast<LoadInst>(VL0); |
| 401 | unsigned Alignment = LI->getAlignment(); |
| 402 | |
| 403 | // Check if all of the loads are consecutive. |
| 404 | for (unsigned i = 1, e = VF; i < e; ++i) |
| 405 | if (!isConsecutiveAccess(VL[i-1], VL[i])) |
| 406 | return Scalarize(VL, VecTy); |
| 407 | |
| 408 | IRBuilder<> Builder(GetLastInstr(VL, VF)); |
| 409 | Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), |
| 410 | VecTy->getPointerTo()); |
| 411 | LI = Builder.CreateLoad(VecPtr); |
| 412 | LI->setAlignment(Alignment); |
| 413 | return LI; |
| 414 | } |
| 415 | case Instruction::Store: { |
| 416 | StoreInst *SI = dyn_cast<StoreInst>(VL0); |
| 417 | unsigned Alignment = SI->getAlignment(); |
| 418 | |
| 419 | ValueList ValueOp; |
| 420 | for (int i = 0; i < VF; ++i) |
| 421 | ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand()); |
| 422 | |
| 423 | Value *VecValue = vectorizeTree(ValueOp, VF); |
| 424 | |
| 425 | IRBuilder<> Builder(GetLastInstr(VL, VF)); |
| 426 | Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), |
| 427 | VecTy->getPointerTo()); |
| 428 | Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); |
| 429 | |
| 430 | for (int i = 0; i < VF; ++i) |
| 431 | cast<Instruction>(VL[i])->eraseFromParent(); |
| 432 | return 0; |
| 433 | } |
| 434 | default: |
| 435 | return Scalarize(VL, VecTy); |
| 436 | } |
| 437 | } |
| 438 | |
| 439 | } // end of namespace |