//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target-independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "aarch64tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeAArch64TTIPass(PassRegistry &);
}

namespace {

class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
  const AArch64TargetMachine *TM;
  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  AArch64TTI(const AArch64TargetMachine *TM)
      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
        TLI(TM->getTargetLowering()) {
    initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override { pushTTIStack(this); }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo *)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  unsigned getIntImmCost(int64_t Val) const;
  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

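  // AArch64 has 31 64-bit general-purpose registers (X0-X30; register number
  // 31 encodes SP or XZR depending on the instruction) and, when NEON is
  // available, 32 128-bit vector registers (V0-V31), which is what the two
  // queries below report.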
  unsigned getNumberOfRegisters(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    return 31;
  }

  unsigned getRegisterBitWidth(bool Vector) const override {
    if (Vector) {
      if (ST->hasNEON())
        return 128;
      return 0;
    }
    return 64;
  }

  unsigned getMaximumUnrollFactor() const override { return 2; }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
      override;

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
      override;

  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                  OperandValueKind Opd1Info = OK_AnyValue,
                                  OperandValueKind Opd2Info = OK_AnyValue) const
      override;

  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
      override;

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;
  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
                   "AArch64 Target Transform Info", true, true, false)
char AArch64TTI::ID = 0;

ImmutablePass *
llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
  return new AArch64TTI(TM);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
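  // Each MOVZ/MOVK instruction sets one 16-bit chunk of the value, so the cost
  // is the number of 16-bit chunks spanned by the significant bits. For
  // example, 0x12345678 costs 2 and 0x123456789abcdef0 costs 4.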
  unsigned LZ = countLeadingZeros((uint64_t)Val);
  return (64 - LZ + 15) / 16;
}

/// \brief Calculate the cost of materializing the given constant.
unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64 bits.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
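  // A 128-bit constant, for instance, is costed as the sum of its two
  // independent 64-bit halves.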
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}

unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

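  // When the immediate sits in the operand position recorded above, report it
  // as free if it is cheap to materialize (at most one basic instruction per
  // 64-bit chunk), so that constant hoisting leaves it in place; otherwise
  // return the real materialization cost.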
  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
               ? static_cast<unsigned>(TCC_Free) : Cost;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                                   const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default:
    return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      unsigned NumConstants = (BitSize + 63) / 64;
      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
      return (Cost <= NumConstants * TCC_Basic)
                 ? static_cast<unsigned>(TCC_Free) : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return AArch64TTI::getIntImmCost(Imm, Ty);
}

AArch64TTI::PopcntSupportKind
AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
  return PSK_Software;
}

unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                      Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

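  // Each table entry is { ISD opcode, destination type, source type, cost in
  // instructions } and is matched by the ConvertCostTableLookup call below.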
  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
    // LowerVectorINT_TO_FP:
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },

    // Complex: to v2f32
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },

    // Complex: to v4f32
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },

    // Complex: to v2f64
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },

    // LowerVectorFP_TO_INT
    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },

    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
  };

  int Idx = ConvertCostTableLookup<MVT>(
      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
      SrcTy.getSimpleVT());
  if (Idx != -1)
    return ConversionTbl[Idx].Cost;

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                        unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // The element at index zero is already inside the vector.
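    // (On AArch64 the scalar FP/SIMD registers alias lane 0 of the
    // corresponding vector register, so accessing lane 0 typically needs no
    // extra instruction.)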
    if (Index == 0)
      return 0;
  }

  // All other inserts/extracts cost this much.
  return 2;
}

unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                            OperandValueKind Opd1Info,
                                            OperandValueKind Opd2Info) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
                                                       Opd2Info);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // These nodes are marked as 'custom' for combining purposes only.
    // We know that they are legal. See LowerAdd in ISelLowering.
    return 1 * LT.first;
  }
}

unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses
  // will likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  // In many cases the address computation is not merged into the instruction
  // addressing mode.
  return 1;
}

unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                        Type *CondTy) const {

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  // We don't lower vector selects that are wider than the register width well.
  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
    // We would need this many instructions to hide the scalarization happening.
    unsigned AmortizationCost = 20;
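    // Each entry charges one AmortizationCost per lane, since a select wider
    // than the register width has to be scalarized lane by lane.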
    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
    VectorSelectTbl[] = {
      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
    };

    EVT SelCondTy = TLI->getValueType(CondTy);
    EVT SelValTy = TLI->getValueType(ValTy);
    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
      int Idx =
          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
                                 SelValTy.getSimpleVT());
      if (Idx != -1)
        return VectorSelectTbl[Idx].Cost;
    }
  }
  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) const {
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);

  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
      Src->getVectorElementType()->isIntegerTy(64)) {
    // Unaligned stores are extremely inefficient. We don't split unaligned
    // v2i64 stores because of the negative impact that has been observed in
    // practice on inlined memcpy code.
    // We make v2i64 stores expensive so that we will only vectorize if there
    // are 6 other instructions getting vectorized.
    unsigned AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;
  }

  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
      Src->getVectorNumElements() < 8) {
    // We scalarize the loads/stores because there is no v.4b register and we
    // have to promote the elements to v.4h.
    unsigned NumVecElts = Src->getVectorNumElements();
    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
    // We generate 2 instructions per vector element.
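    // For a <4 x i8> access, for example, this evaluates to 8 * 4 * 2 = 64.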
    return NumVectorizableInstsToAmortize * NumVecElts * 2;
  }

  return LT.first;
}