Blame - llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp - toolchain/llvm-project

blob: a2b2894370837d78251d7d0a66b65f13bdd9f0ba [file] [log] [blame]

Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	1	//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements a TargetTransformInfo analysis pass specific to the
				11	// SystemZ target machine. It uses the target's detailed information to provide
				12	// more precise answers to certain TTI queries, while letting the target
				13	// independent and default TTI implementations handle the rest.
				14	//
				15	//===----------------------------------------------------------------------===//
				16
				17	#include "SystemZTargetTransformInfo.h"
				18	#include "llvm/Analysis/TargetTransformInfo.h"
				19	#include "llvm/CodeGen/BasicTTIImpl.h"
David Blaikie	b3bde2e	2017-11-17 01:07:10 +0000	[diff] [blame]	20	#include "llvm/CodeGen/CostTable.h"
				21	#include "llvm/CodeGen/TargetLowering.h"
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	22	#include "llvm/IR/IntrinsicInst.h"
				23	#include "llvm/Support/Debug.h"
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	24	using namespace llvm;
				25
				26	#define DEBUG_TYPE "systemztti"
				27
				28	//===----------------------------------------------------------------------===//
				29	//
				30	// SystemZ cost model.
				31	//
				32	//===----------------------------------------------------------------------===//
				33
Chandler Carruth	93205eb	2015-08-05 18:08:10 +0000	[diff] [blame]	34	int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	35	assert(Ty->isIntegerTy());
				36
				37	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				38	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				39	// here, so that constant hoisting will ignore this constant.
				40	if (BitSize == 0)
				41	return TTI::TCC_Free;
				42	// No cost model for operations on integers larger than 64 bit implemented yet.
				43	if (BitSize > 64)
				44	return TTI::TCC_Free;
				45
				46	if (Imm == 0)
				47	return TTI::TCC_Free;
				48
				49	if (Imm.getBitWidth() <= 64) {
				50	// Constants loaded via lgfi.
				51	if (isInt<32>(Imm.getSExtValue()))
				52	return TTI::TCC_Basic;
				53	// Constants loaded via llilf.
				54	if (isUInt<32>(Imm.getZExtValue()))
				55	return TTI::TCC_Basic;
				56	// Constants loaded via llihf:
				57	if ((Imm.getZExtValue() & 0xffffffff) == 0)
				58	return TTI::TCC_Basic;
				59
				60	return 2 * TTI::TCC_Basic;
				61	}
				62
				63	return 4 * TTI::TCC_Basic;
				64	}
				65
Chandler Carruth	93205eb	2015-08-05 18:08:10 +0000	[diff] [blame]	66	int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
				67	const APInt &Imm, Type *Ty) {
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	68	assert(Ty->isIntegerTy());
				69
				70	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				71	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				72	// here, so that constant hoisting will ignore this constant.
				73	if (BitSize == 0)
				74	return TTI::TCC_Free;
				75	// No cost model for operations on integers larger than 64 bit implemented yet.
				76	if (BitSize > 64)
				77	return TTI::TCC_Free;
				78
				79	switch (Opcode) {
				80	default:
				81	return TTI::TCC_Free;
				82	case Instruction::GetElementPtr:
				83	// Always hoist the base address of a GetElementPtr. This prevents the
				84	// creation of new constants for every base constant that gets constant
				85	// folded with the offset.
				86	if (Idx == 0)
				87	return 2 * TTI::TCC_Basic;
				88	return TTI::TCC_Free;
				89	case Instruction::Store:
				90	if (Idx == 0 && Imm.getBitWidth() <= 64) {
				91	// Any 8-bit immediate store can by implemented via mvi.
				92	if (BitSize == 8)
				93	return TTI::TCC_Free;
				94	// 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
				95	if (isInt<16>(Imm.getSExtValue()))
				96	return TTI::TCC_Free;
				97	}
				98	break;
				99	case Instruction::ICmp:
				100	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				101	// Comparisons against signed 32-bit immediates implemented via cgfi.
				102	if (isInt<32>(Imm.getSExtValue()))
				103	return TTI::TCC_Free;
				104	// Comparisons against unsigned 32-bit immediates implemented via clgfi.
				105	if (isUInt<32>(Imm.getZExtValue()))
				106	return TTI::TCC_Free;
				107	}
				108	break;
				109	case Instruction::Add:
				110	case Instruction::Sub:
				111	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				112	// We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
				113	if (isUInt<32>(Imm.getZExtValue()))
				114	return TTI::TCC_Free;
				115	// Or their negation, by swapping addition vs. subtraction.
				116	if (isUInt<32>(-Imm.getSExtValue()))
				117	return TTI::TCC_Free;
				118	}
				119	break;
				120	case Instruction::Mul:
				121	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				122	// We use msgfi to multiply by 32-bit signed immediates.
				123	if (isInt<32>(Imm.getSExtValue()))
				124	return TTI::TCC_Free;
				125	}
				126	break;
				127	case Instruction::Or:
				128	case Instruction::Xor:
				129	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				130	// Masks supported by oilf/xilf.
				131	if (isUInt<32>(Imm.getZExtValue()))
				132	return TTI::TCC_Free;
				133	// Masks supported by oihf/xihf.
				134	if ((Imm.getZExtValue() & 0xffffffff) == 0)
				135	return TTI::TCC_Free;
				136	}
				137	break;
				138	case Instruction::And:
				139	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				140	// Any 32-bit AND operation can by implemented via nilf.
				141	if (BitSize <= 32)
				142	return TTI::TCC_Free;
				143	// 64-bit masks supported by nilf.
				144	if (isUInt<32>(~Imm.getZExtValue()))
				145	return TTI::TCC_Free;
				146	// 64-bit masks supported by nilh.
				147	if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
				148	return TTI::TCC_Free;
				149	// Some 64-bit AND operations can be implemented via risbg.
				150	const SystemZInstrInfo *TII = ST->getInstrInfo();
				151	unsigned Start, End;
				152	if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
				153	return TTI::TCC_Free;
				154	}
				155	break;
				156	case Instruction::Shl:
				157	case Instruction::LShr:
				158	case Instruction::AShr:
				159	// Always return TCC_Free for the shift value of a shift instruction.
				160	if (Idx == 1)
				161	return TTI::TCC_Free;
				162	break;
				163	case Instruction::UDiv:
				164	case Instruction::SDiv:
				165	case Instruction::URem:
				166	case Instruction::SRem:
				167	case Instruction::Trunc:
				168	case Instruction::ZExt:
				169	case Instruction::SExt:
				170	case Instruction::IntToPtr:
				171	case Instruction::PtrToInt:
				172	case Instruction::BitCast:
				173	case Instruction::PHI:
				174	case Instruction::Call:
				175	case Instruction::Select:
				176	case Instruction::Ret:
				177	case Instruction::Load:
				178	break;
				179	}
				180
				181	return SystemZTTIImpl::getIntImmCost(Imm, Ty);
				182	}
				183
Chandler Carruth	93205eb	2015-08-05 18:08:10 +0000	[diff] [blame]	184	int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
				185	const APInt &Imm, Type *Ty) {
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	186	assert(Ty->isIntegerTy());
				187
				188	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				189	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				190	// here, so that constant hoisting will ignore this constant.
				191	if (BitSize == 0)
				192	return TTI::TCC_Free;
				193	// No cost model for operations on integers larger than 64 bit implemented yet.
				194	if (BitSize > 64)
				195	return TTI::TCC_Free;
				196
				197	switch (IID) {
				198	default:
				199	return TTI::TCC_Free;
				200	case Intrinsic::sadd_with_overflow:
				201	case Intrinsic::uadd_with_overflow:
				202	case Intrinsic::ssub_with_overflow:
				203	case Intrinsic::usub_with_overflow:
				204	// These get expanded to include a normal addition/subtraction.
				205	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				206	if (isUInt<32>(Imm.getZExtValue()))
				207	return TTI::TCC_Free;
				208	if (isUInt<32>(-Imm.getSExtValue()))
				209	return TTI::TCC_Free;
				210	}
				211	break;
				212	case Intrinsic::smul_with_overflow:
				213	case Intrinsic::umul_with_overflow:
				214	// These get expanded to include a normal multiplication.
				215	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				216	if (isInt<32>(Imm.getSExtValue()))
				217	return TTI::TCC_Free;
				218	}
				219	break;
				220	case Intrinsic::experimental_stackmap:
				221	if ((Idx < 2) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				222	return TTI::TCC_Free;
				223	break;
				224	case Intrinsic::experimental_patchpoint_void:
				225	case Intrinsic::experimental_patchpoint_i64:
				226	if ((Idx < 4) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				227	return TTI::TCC_Free;
				228	break;
				229	}
				230	return SystemZTTIImpl::getIntImmCost(Imm, Ty);
				231	}
Ulrich Weigand	b401218	2015-03-31 12:56:33 +0000	[diff] [blame]	232
				233	TargetTransformInfo::PopcntSupportKind
				234	SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
				235	assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
				236	if (ST->hasPopulationCount() && TyWidth <= 64)
				237	return TTI::PSK_FastHardware;
				238	return TTI::PSK_Software;
				239	}
				240
Geoff Berry	66d9bdb	2017-06-28 15:53:17 +0000	[diff] [blame]	241	void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
Jonas Paulsson	58c5a7f	2016-09-28 09:41:38 +0000	[diff] [blame]	242	TTI::UnrollingPreferences &UP) {
				243	// Find out if L contains a call, what the machine instruction count
				244	// estimate is, and how many stores there are.
				245	bool HasCall = false;
				246	unsigned NumStores = 0;
				247	for (auto &BB : L->blocks())
				248	for (auto &I : *BB) {
				249	if (isa<CallInst>(&I) \|\| isa<InvokeInst>(&I)) {
				250	ImmutableCallSite CS(&I);
				251	if (const Function *F = CS.getCalledFunction()) {
				252	if (isLoweredToCall(F))
				253	HasCall = true;
				254	if (F->getIntrinsicID() == Intrinsic::memcpy \|\|
				255	F->getIntrinsicID() == Intrinsic::memset)
				256	NumStores++;
				257	} else { // indirect call.
				258	HasCall = true;
				259	}
				260	}
				261	if (isa<StoreInst>(&I)) {
Jonas Paulsson	58c5a7f	2016-09-28 09:41:38 +0000	[diff] [blame]	262	Type *MemAccessTy = I.getOperand(0)->getType();
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	263	NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
Jonas Paulsson	58c5a7f	2016-09-28 09:41:38 +0000	[diff] [blame]	264	}
				265	}
				266
				267	// The z13 processor will run out of store tags if too many stores
				268	// are fed into it too quickly. Therefore make sure there are not
				269	// too many stores in the resulting unrolled loop.
				270	unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
				271
				272	if (HasCall) {
				273	// Only allow full unrolling if loop has any calls.
				274	UP.FullUnrollMaxCount = Max;
				275	UP.MaxCount = 1;
				276	return;
				277	}
				278
				279	UP.MaxCount = Max;
				280	if (UP.MaxCount <= 1)
				281	return;
				282
				283	// Allow partial and runtime trip count unrolling.
				284	UP.Partial = UP.Runtime = true;
				285
				286	UP.PartialThreshold = 75;
				287	UP.DefaultUnrollRuntimeCount = 4;
				288
				289	// Allow expensive instructions in the pre-header of the loop.
				290	UP.AllowExpensiveTripCount = true;
				291
				292	UP.Force = true;
				293	}
				294
Jonas Paulsson	024e319	2017-07-21 11:59:37 +0000	[diff] [blame]	295
				296	bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
				297	TargetTransformInfo::LSRCost &C2) {
				298	// SystemZ specific: check instruction count (first), and don't care about
				299	// ImmCost, since offsets are checked explicitly.
				300	return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
				301	C1.NumIVMuls, C1.NumBaseAdds,
				302	C1.ScaleCost, C1.SetupCost) <
				303	std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
				304	C2.NumIVMuls, C2.NumBaseAdds,
				305	C2.ScaleCost, C2.SetupCost);
				306	}
				307
Ulrich Weigand	ce4c109	2015-05-05 19:25:42 +0000	[diff] [blame]	308	unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
				309	if (!Vector)
				310	// Discount the stack pointer. Also leave out %r0, since it can't
				311	// be used in an address.
				312	return 14;
				313	if (ST->hasVector())
				314	return 32;
				315	return 0;
				316	}
				317
Daniel Neilson	c0112ae	2017-06-12 14:22:21 +0000	[diff] [blame]	318	unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
Ulrich Weigand	ce4c109	2015-05-05 19:25:42 +0000	[diff] [blame]	319	if (!Vector)
				320	return 64;
				321	if (ST->hasVector())
				322	return 128;
				323	return 0;
				324	}
				325
Jonas Paulsson	e54cc1a	2017-11-06 13:10:31 +0000	[diff] [blame]	326	bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
				327	EVT VT = TLI->getValueType(DL, DataType);
				328	return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
				329	}
				330
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	331	// Return the bit size for the scalar type or vector element
				332	// type. getScalarSizeInBits() returns 0 for a pointer type.
				333	static unsigned getScalarSizeInBits(Type *Ty) {
				334	unsigned Size =
				335	(Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
				336	assert(Size > 0 && "Element must have non-zero size.");
				337	return Size;
				338	}
				339
				340	// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
				341	// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
				342	// 3.
				343	static unsigned getNumVectorRegs(Type *Ty) {
				344	assert(Ty->isVectorTy() && "Expected vector type");
				345	unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
				346	assert(WideBits > 0 && "Could not compute size of vector");
				347	return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
				348	}
				349
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	350	int SystemZTTIImpl::getArithmeticInstrCost(
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	351	unsigned Opcode, Type *Ty,
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	352	TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
				353	TTI::OperandValueProperties Opd1PropInfo,
				354	TTI::OperandValueProperties Opd2PropInfo,
				355	ArrayRef<const Value *> Args) {
				356
				357	// TODO: return a good value for BB-VECTORIZER that includes the
				358	// immediate loads, which we do not want to count for the loop
				359	// vectorizer, since they are hopefully hoisted out of the loop. This
				360	// would require a new parameter 'InLoop', but not sure if constant
				361	// args are common enough to motivate this.
				362
				363	unsigned ScalarBits = Ty->getScalarSizeInBits();
				364
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	365	// There are thre cases of division and remainder: Dividing with a register
				366	// needs a divide instruction. A divisor which is a power of two constant
				367	// can be implemented with a sequence of shifts. Any other constant needs a
				368	// multiply and shifts.
				369	const unsigned DivInstrCost = 20;
				370	const unsigned DivMulSeqCost = 10;
				371	const unsigned SDivPow2Cost = 4;
				372
				373	bool SignedDivRem =
				374	Opcode == Instruction::SDiv \|\| Opcode == Instruction::SRem;
				375	bool UnsignedDivRem =
				376	Opcode == Instruction::UDiv \|\| Opcode == Instruction::URem;
				377
				378	// Check for a constant divisor.
				379	bool DivRemConst = false;
				380	bool DivRemConstPow2 = false;
				381	if ((SignedDivRem \|\| UnsignedDivRem) && Args.size() == 2) {
Jonas Paulsson	8722ade	2017-05-17 12:46:26 +0000	[diff] [blame]	382	if (const Constant *C = dyn_cast<Constant>(Args[1])) {
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	383	const ConstantInt *CVal =
				384	(C->getType()->isVectorTy()
				385	? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
				386	: dyn_cast<const ConstantInt>(C));
				387	if (CVal != nullptr &&
				388	(CVal->getValue().isPowerOf2() \|\| (-CVal->getValue()).isPowerOf2()))
				389	DivRemConstPow2 = true;
Jonas Paulsson	8722ade	2017-05-17 12:46:26 +0000	[diff] [blame]	390	else
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	391	DivRemConst = true;
Jonas Paulsson	8722ade	2017-05-17 12:46:26 +0000	[diff] [blame]	392	}
				393	}
				394
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	395	if (Ty->isVectorTy()) {
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	396	assert(ST->hasVector() &&
				397	"getArithmeticInstrCost() called with vector type.");
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	398	unsigned VF = Ty->getVectorNumElements();
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	399	unsigned NumVectors = getNumVectorRegs(Ty);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	400
				401	// These vector operations are custom handled, but are still supported
				402	// with one instruction per vector, regardless of element size.
				403	if (Opcode == Instruction::Shl \|\| Opcode == Instruction::LShr \|\|
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	404	Opcode == Instruction::AShr) {
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	405	return NumVectors;
				406	}
				407
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	408	if (DivRemConstPow2)
				409	return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
				410	if (DivRemConst)
				411	return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
				412	if ((SignedDivRem \|\| UnsignedDivRem) && VF > 4)
				413	// Temporary hack: disable high vectorization factors with integer
				414	// division/remainder, which will get scalarized and handled with
				415	// GR128 registers. The mischeduler is not clever enough to avoid
				416	// spilling yet.
Jonas Paulsson	bf66f38	2018-10-10 09:30:29 +0000	[diff] [blame]	417	return 1000;
				418
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	419	// These FP operations are supported with a single vector instruction for
				420	// double (base implementation assumes float generally costs 2). For
				421	// FP128, the scalar cost is 1, and there is no overhead since the values
				422	// are already in scalar registers.
				423	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
				424	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv) {
				425	switch (ScalarBits) {
				426	case 32: {
Ulrich Weigand	33435c4	2017-07-17 17:42:48 +0000	[diff] [blame]	427	// The vector enhancements facility 1 provides v4f32 instructions.
				428	if (ST->hasVectorEnhancements1())
				429	return NumVectors;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	430	// Return the cost of multiple scalar invocation plus the cost of
				431	// inserting and extracting the values.
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	432	unsigned ScalarCost =
				433	getArithmeticInstrCost(Opcode, Ty->getScalarType());
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	434	unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
				435	// FIXME: VF 2 for these FP operations are currently just as
				436	// expensive as for VF 4.
				437	if (VF == 2)
				438	Cost *= 2;
				439	return Cost;
				440	}
				441	case 64:
				442	case 128:
				443	return NumVectors;
				444	default:
				445	break;
				446	}
				447	}
				448
				449	// There is no native support for FRem.
				450	if (Opcode == Instruction::FRem) {
				451	unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
				452	// FIXME: VF 2 for float is currently just as expensive as for VF 4.
				453	if (VF == 2 && ScalarBits == 32)
				454	Cost *= 2;
				455	return Cost;
				456	}
				457	}
				458	else { // Scalar:
				459	// These FP operations are supported with a dedicated instruction for
				460	// float, double and fp128 (base implementation assumes float generally
				461	// costs 2).
				462	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
				463	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv)
				464	return 1;
				465
				466	// There is no native support for FRem.
				467	if (Opcode == Instruction::FRem)
				468	return LIBCALL_COST;
				469
				470	if (Opcode == Instruction::LShr \|\| Opcode == Instruction::AShr)
				471	return (ScalarBits >= 32 ? 1 : 2 /ext/);
				472
				473	// Or requires one instruction, although it has custom handling for i64.
				474	if (Opcode == Instruction::Or)
				475	return 1;
				476
Jonas Paulsson	77df2f2	2018-09-14 06:46:55 +0000	[diff] [blame]	477	if (Opcode == Instruction::Xor && ScalarBits == 1) {
				478	if (ST->hasLoadStoreOnCond2())
				479	return 5; // 2 * (li 0; loc 1); xor
				480	return 7; // 2 * ipm sequences ; xor ; shift ; compare
				481	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	482
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	483	if (DivRemConstPow2)
				484	return (SignedDivRem ? SDivPow2Cost : 1);
				485	if (DivRemConst)
				486	return DivMulSeqCost;
				487	if (SignedDivRem)
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	488	// sext of op(s) for narrow types
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	489	return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0));
				490	if (UnsignedDivRem)
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	491	// Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	492	return DivInstrCost + (ScalarBits < 32 ? 3 : 1);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	493	}
				494
				495	// Fallback to the default implementation.
				496	return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
				497	Opd1PropInfo, Opd2PropInfo, Args);
				498	}
				499
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	500	int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
				501	Type *SubTp) {
				502	assert (Tp->isVectorTy());
				503	assert (ST->hasVector() && "getShuffleCost() called.");
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	504	unsigned NumVectors = getNumVectorRegs(Tp);
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	505
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	506	// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
				507
				508	// FP128 values are always in scalar registers, so there is no work
				509	// involved with a shuffle, except for broadcast. In that case register
				510	// moves are done with a single instruction per element.
				511	if (Tp->getScalarType()->isFP128Ty())
				512	return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
				513
				514	switch (Kind) {
				515	case TargetTransformInfo::SK_ExtractSubvector:
				516	// ExtractSubvector Index indicates start offset.
				517
				518	// Extracting a subvector from first index is a noop.
				519	return (Index == 0 ? 0 : NumVectors);
				520
				521	case TargetTransformInfo::SK_Broadcast:
				522	// Loop vectorizer calls here to figure out the extra cost of
				523	// broadcasting a loaded value to all elements of a vector. Since vlrep
				524	// loads and replicates with a single instruction, adjust the returned
				525	// value.
				526	return NumVectors - 1;
				527
				528	default:
				529
				530	// SystemZ supports single instruction permutation / replication.
				531	return NumVectors;
				532	}
				533
				534	return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
				535	}
				536
				537	// Return the log2 difference of the element sizes of the two vector types.
				538	static unsigned getElSizeLog2Diff(Type Ty0, Type Ty1) {
				539	unsigned Bits0 = Ty0->getScalarSizeInBits();
				540	unsigned Bits1 = Ty1->getScalarSizeInBits();
				541
				542	if (Bits1 > Bits0)
				543	return (Log2_32(Bits1) - Log2_32(Bits0));
				544
				545	return (Log2_32(Bits0) - Log2_32(Bits1));
				546	}
				547
				548	// Return the number of instructions needed to truncate SrcTy to DstTy.
				549	unsigned SystemZTTIImpl::
				550	getVectorTruncCost(Type SrcTy, Type DstTy) {
				551	assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
				552	assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
				553	"Packing must reduce size of vector type.");
				554	assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
				555	"Packing should not change number of elements.");
				556
				557	// TODO: Since fp32 is expanded, the extract cost should always be 0.
				558
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	559	unsigned NumParts = getNumVectorRegs(SrcTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	560	if (NumParts <= 2)
				561	// Up to 2 vector registers can be truncated efficiently with pack or
				562	// permute. The latter requires an immediate mask to be loaded, which
				563	// typically gets hoisted out of a loop. TODO: return a good value for
				564	// BB-VECTORIZER that includes the immediate loads, which we do not want
				565	// to count for the loop vectorizer.
				566	return 1;
				567
				568	unsigned Cost = 0;
				569	unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
				570	unsigned VF = SrcTy->getVectorNumElements();
				571	for (unsigned P = 0; P < Log2Diff; ++P) {
				572	if (NumParts > 1)
				573	NumParts /= 2;
				574	Cost += NumParts;
				575	}
				576
				577	// Currently, a general mix of permutes and pack instructions is output by
				578	// isel, which follow the cost computation above except for this case which
				579	// is one instruction less:
				580	if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
				581	DstTy->getScalarSizeInBits() == 8)
				582	Cost--;
				583
				584	return Cost;
				585	}
				586
				587	// Return the cost of converting a vector bitmask produced by a compare
				588	// (SrcTy), to the type of the select or extend instruction (DstTy).
				589	unsigned SystemZTTIImpl::
				590	getVectorBitmaskConversionCost(Type SrcTy, Type DstTy) {
				591	assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
				592	"Should only be called with vector types.");
				593
				594	unsigned PackCost = 0;
				595	unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
				596	unsigned DstScalarBits = DstTy->getScalarSizeInBits();
				597	unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
				598	if (SrcScalarBits > DstScalarBits)
				599	// The bitmask will be truncated.
				600	PackCost = getVectorTruncCost(SrcTy, DstTy);
				601	else if (SrcScalarBits < DstScalarBits) {
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	602	unsigned DstNumParts = getNumVectorRegs(DstTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	603	// Each vector select needs its part of the bitmask unpacked.
				604	PackCost = Log2Diff * DstNumParts;
				605	// Extra cost for moving part of mask before unpacking.
				606	PackCost += DstNumParts - 1;
				607	}
				608
				609	return PackCost;
				610	}
				611
				612	// Return the type of the compared operands. This is needed to compute the
				613	// cost for a Select / ZExt or SExt instruction.
				614	static Type getCmpOpsType(const Instruction I, unsigned VF = 1) {
				615	Type *OpTy = nullptr;
				616	if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
				617	OpTy = CI->getOperand(0)->getType();
				618	else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
Jonas Paulsson	f40eac5	2017-05-03 13:33:45 +0000	[diff] [blame]	619	if (LogicI->getNumOperands() == 2)
				620	if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
				621	if (isa<CmpInst>(LogicI->getOperand(1)))
				622	OpTy = CI0->getOperand(0)->getType();
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	623
				624	if (OpTy != nullptr) {
				625	if (VF == 1) {
				626	assert (!OpTy->isVectorTy() && "Expected scalar type");
				627	return OpTy;
				628	}
				629	// Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
				630	// be either scalar or already vectorized with a same or lesser VF.
				631	Type *ElTy = OpTy->getScalarType();
				632	return VectorType::get(ElTy, VF);
				633	}
				634
				635	return nullptr;
				636	}
				637
Jonas Paulsson	f15a53b	2018-11-01 09:01:51 +0000	[diff] [blame]	638	// Get the cost of converting a boolean vector to a vector with same width
				639	// and element size as Dst, plus the cost of zero extending if needed.
				640	unsigned SystemZTTIImpl::
				641	getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
				642	const Instruction *I) {
				643	assert (Dst->isVectorTy());
				644	unsigned VF = Dst->getVectorNumElements();
				645	unsigned Cost = 0;
				646	// If we know what the widths of the compared operands, get any cost of
				647	// converting it to match Dst. Otherwise assume same widths.
				648	Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
				649	if (CmpOpTy != nullptr)
				650	Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
				651	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::UIToFP)
				652	// One 'vn' per dst vector with an immediate mask.
				653	Cost += getNumVectorRegs(Dst);
				654	return Cost;
				655	}
				656
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	657	int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type Dst, Type Src,
				658	const Instruction *I) {
				659	unsigned DstScalarBits = Dst->getScalarSizeInBits();
				660	unsigned SrcScalarBits = Src->getScalarSizeInBits();
				661
				662	if (Src->isVectorTy()) {
				663	assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
				664	assert (Dst->isVectorTy());
				665	unsigned VF = Src->getVectorNumElements();
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	666	unsigned NumDstVectors = getNumVectorRegs(Dst);
				667	unsigned NumSrcVectors = getNumVectorRegs(Src);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	668
				669	if (Opcode == Instruction::Trunc) {
				670	if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
				671	return 0; // Check for NOOP conversions.
				672	return getVectorTruncCost(Src, Dst);
				673	}
				674
				675	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt) {
				676	if (SrcScalarBits >= 8) {
				677	// ZExt/SExt will be handled with one unpack per doubling of width.
				678	unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
				679
				680	// For types that spans multiple vector registers, some additional
				681	// instructions are used to setup the unpacking.
				682	unsigned NumSrcVectorOps =
				683	(NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
				684	: (NumDstVectors / 2));
				685
				686	return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
				687	}
Jonas Paulsson	f15a53b	2018-11-01 09:01:51 +0000	[diff] [blame]	688	else if (SrcScalarBits == 1)
				689	return getBoolVecToIntConversionCost(Opcode, Dst, I);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	690	}
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	691
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	692	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP \|\|
				693	Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI) {
				694	// TODO: Fix base implementation which could simplify things a bit here
				695	// (seems to miss on differentiating on scalar/vector types).
				696
				697	// Only 64 bit vector conversions are natively supported.
Jonas Paulsson	f15a53b	2018-11-01 09:01:51 +0000	[diff] [blame]	698	if (DstScalarBits == 64) {
				699	if (SrcScalarBits == 64)
				700	return NumDstVectors;
				701
				702	if (SrcScalarBits == 1)
				703	return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
				704	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	705
				706	// Return the cost of multiple scalar invocation plus the cost of
				707	// inserting and extracting the values. Base implementation does not
				708	// realize float->int gets scalarized.
				709	unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
				710	Src->getScalarType());
				711	unsigned TotCost = VF * ScalarCost;
				712	bool NeedsInserts = true, NeedsExtracts = true;
				713	// FP128 registers do not get inserted or extracted.
				714	if (DstScalarBits == 128 &&
				715	(Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP))
				716	NeedsInserts = false;
				717	if (SrcScalarBits == 128 &&
				718	(Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI))
				719	NeedsExtracts = false;
				720
Jonas Paulsson	5cea85d	2018-11-12 15:32:27 +0000	[diff] [blame]	721	TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
				722	TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	723
				724	// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
				725	if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
				726	TotCost *= 2;
				727
				728	return TotCost;
				729	}
				730
				731	if (Opcode == Instruction::FPTrunc) {
				732	if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
				733	return VF /ldxbr/lexbr/ + getScalarizationOverhead(Dst, true, false);
				734	else // double -> float
				735	return VF / 2 /vledb/ + std::max(1U, VF / 4 /vperm/);
				736	}
				737
				738	if (Opcode == Instruction::FPExt) {
				739	if (SrcScalarBits == 32 && DstScalarBits == 64) {
				740	// float -> double is very rare and currently unoptimized. Instead of
				741	// using vldeb, which can do two at a time, all conversions are
				742	// scalarized.
				743	return VF * 2;
				744	}
				745	// -> fp128. VF * lxdb/lxeb + extraction of elements.
				746	return VF + getScalarizationOverhead(Src, false, true);
				747	}
				748	}
				749	else { // Scalar
				750	assert (!Dst->isVectorTy());
				751
Jonas Paulsson	cced2a2	2018-11-02 17:53:31 +0000	[diff] [blame]	752	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP) {
				753	if (SrcScalarBits >= 32 \|\|
				754	(I != nullptr && isa<LoadInst>(I->getOperand(0))))
				755	return 1;
				756	return SrcScalarBits > 1 ? 2 /i8/i16 extend/ : 5 /branch seq./;
				757	}
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	758
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	759	if ((Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt) &&
				760	Src->isIntegerTy(1)) {
Jonas Paulsson	77df2f2	2018-09-14 06:46:55 +0000	[diff] [blame]	761	if (ST->hasLoadStoreOnCond2())
				762	return 2; // li 0; loc 1
				763
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	764	// This should be extension of a compare i1 result, which is done with
				765	// ipm and a varying sequence of instructions.
				766	unsigned Cost = 0;
				767	if (Opcode == Instruction::SExt)
				768	Cost = (DstScalarBits < 64 ? 3 : 4);
				769	if (Opcode == Instruction::ZExt)
				770	Cost = 3;
				771	Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
				772	if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
				773	// If operands of an fp-type was compared, this costs +1.
				774	Cost++;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	775	return Cost;
				776	}
				777	}
				778
				779	return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
				780	}
				781
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	782	int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				783	Type CondTy, const Instruction I) {
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	784	if (ValTy->isVectorTy()) {
				785	assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	786	unsigned VF = ValTy->getVectorNumElements();
				787
				788	// Called with a compare instruction.
				789	if (Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) {
				790	unsigned PredicateExtraCost = 0;
				791	if (I != nullptr) {
				792	// Some predicates cost one or two extra instructions.
Craig Topper	781aa18	2018-05-05 01:57:00 +0000	[diff] [blame]	793	switch (cast<CmpInst>(I)->getPredicate()) {
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	794	case CmpInst::Predicate::ICMP_NE:
				795	case CmpInst::Predicate::ICMP_UGE:
				796	case CmpInst::Predicate::ICMP_ULE:
				797	case CmpInst::Predicate::ICMP_SGE:
				798	case CmpInst::Predicate::ICMP_SLE:
				799	PredicateExtraCost = 1;
				800	break;
				801	case CmpInst::Predicate::FCMP_ONE:
				802	case CmpInst::Predicate::FCMP_ORD:
				803	case CmpInst::Predicate::FCMP_UEQ:
				804	case CmpInst::Predicate::FCMP_UNO:
				805	PredicateExtraCost = 2;
				806	break;
				807	default:
				808	break;
				809	}
				810	}
				811
				812	// Float is handled with 2vmr[lh]f + 2vldeb + vfchdb for each pair of
				813	// floats. FIXME: <2 x float> generates same code as <4 x float>.
				814	unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	815	unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	816
				817	unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
				818	return Cost;
				819	}
				820	else { // Called with a select instruction.
				821	assert (Opcode == Instruction::Select);
				822
				823	// We can figure out the extra cost of packing / unpacking if the
				824	// instruction was passed and the compare instruction is found.
				825	unsigned PackCost = 0;
				826	Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
				827	if (CmpOpTy != nullptr)
				828	PackCost =
				829	getVectorBitmaskConversionCost(CmpOpTy, ValTy);
				830
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	831	return getNumVectorRegs(ValTy) /vsel/ + PackCost;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	832	}
				833	}
				834	else { // Scalar
				835	switch (Opcode) {
				836	case Instruction::ICmp: {
				837	unsigned Cost = 1;
				838	if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
				839	Cost += 2; // extend both operands
				840	return Cost;
				841	}
				842	case Instruction::Select:
				843	if (ValTy->isFloatingPointTy())
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	844	return 4; // No load on condition for FP - costs a conditional jump.
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	845	return 1; // Load On Condition.
				846	}
				847	}
				848
				849	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
				850	}
				851
				852	int SystemZTTIImpl::
				853	getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
				854	// vlvgp will insert two grs into a vector register, so only count half the
				855	// number of instructions.
Craig Topper	fde4723	2017-07-09 07:04:03 +0000	[diff] [blame]	856	if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	857	return ((Index % 2 == 0) ? 1 : 0);
				858
				859	if (Opcode == Instruction::ExtractElement) {
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	860	int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /+test-under-mask/ : 1);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	861
				862	// Give a slight penalty for moving out of vector pipeline to FXU unit.
Craig Topper	95d2347	2017-07-09 07:04:00 +0000	[diff] [blame]	863	if (Index == 0 && Val->isIntOrIntVectorTy())
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	864	Cost += 1;
				865
				866	return Cost;
				867	}
				868
				869	return BaseT::getVectorInstrCost(Opcode, Val, Index);
				870	}
				871
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	872	// Check if a load may be folded as a memory operand in its user.
				873	bool SystemZTTIImpl::
				874	isFoldableLoad(const LoadInst Ld, const Instruction &FoldedValue) {
				875	if (!Ld->hasOneUse())
				876	return false;
				877	FoldedValue = Ld;
				878	const Instruction UserI = cast<Instruction>(Ld->user_begin());
				879	unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
				880	unsigned TruncBits = 0;
				881	unsigned SExtBits = 0;
				882	unsigned ZExtBits = 0;
				883	if (UserI->hasOneUse()) {
				884	unsigned UserBits = UserI->getType()->getScalarSizeInBits();
				885	if (isa<TruncInst>(UserI))
				886	TruncBits = UserBits;
				887	else if (isa<SExtInst>(UserI))
				888	SExtBits = UserBits;
				889	else if (isa<ZExtInst>(UserI))
				890	ZExtBits = UserBits;
				891	}
				892	if (TruncBits \|\| SExtBits \|\| ZExtBits) {
				893	FoldedValue = UserI;
				894	UserI = cast<Instruction>(*UserI->user_begin());
				895	// Load (single use) -> trunc/extend (single use) -> UserI
				896	}
Jonas Paulsson	af8e036	2018-10-30 13:41:03 +0000	[diff] [blame]	897	if ((UserI->getOpcode() == Instruction::Sub \|\|
				898	UserI->getOpcode() == Instruction::SDiv \|\|
				899	UserI->getOpcode() == Instruction::UDiv) &&
				900	UserI->getOperand(1) != FoldedValue)
				901	return false; // Not commutative, only RHS foldable.
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	902	switch (UserI->getOpcode()) {
				903	case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
				904	case Instruction::Sub:
				905	if (LoadedBits == 32 && ZExtBits == 64)
				906	return true;
				907	LLVM_FALLTHROUGH;
				908	case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
				909	if (LoadedBits == 16 &&
				910	(SExtBits == 32 \|\|
				911	(SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
				912	return true;
				913	LLVM_FALLTHROUGH;
				914	case Instruction::SDiv:// SE: 32->64
				915	if (LoadedBits == 32 && SExtBits == 64)
				916	return true;
				917	LLVM_FALLTHROUGH;
				918	case Instruction::UDiv:
				919	case Instruction::And:
				920	case Instruction::Or:
				921	case Instruction::Xor:
				922	case Instruction::ICmp:
				923	// This also makes sense for float operations, but disabled for now due
				924	// to regressions.
				925	// case Instruction::FCmp:
				926	// case Instruction::FAdd:
				927	// case Instruction::FSub:
				928	// case Instruction::FMul:
				929	// case Instruction::FDiv:
				930
				931	// All possible extensions of memory checked above.
				932	if (SExtBits \|\| ZExtBits)
				933	return false;
				934
				935	unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
				936	return (LoadOrTruncBits == 32 \|\| LoadOrTruncBits == 64);
				937	break;
				938	}
				939	return false;
				940	}
				941
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	942	int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
				943	unsigned Alignment, unsigned AddressSpace,
				944	const Instruction *I) {
				945	assert(!Src->isVoidTy() && "Invalid type");
				946
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	947	if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
				948	// Store the load or its truncated or extended value in FoldedValue.
				949	const Instruction *FoldedValue = nullptr;
				950	if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
				951	const Instruction UserI = cast<Instruction>(FoldedValue->user_begin());
				952	assert (UserI->getNumOperands() == 2 && "Expected a binop.");
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	953
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	954	// UserI can't fold two loads, so in that case return 0 cost only
				955	// half of the time.
				956	for (unsigned i = 0; i < 2; ++i) {
				957	if (UserI->getOperand(i) == FoldedValue)
				958	continue;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	959
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	960	if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
				961	LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
				962	if (!OtherLoad &&
				963	(isa<TruncInst>(OtherOp) \|\| isa<SExtInst>(OtherOp) \|\|
				964	isa<ZExtInst>(OtherOp)))
				965	OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
				966	if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/dummy/))
				967	return i == 0; // Both operands foldable.
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	968	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	969	}
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	970
				971	return 0; // Only I is foldable in user.
				972	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	973	}
				974
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	975	unsigned NumOps =
				976	(Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	977
				978	if (Src->getScalarSizeInBits() == 128)
				979	// 128 bit scalars are held in a pair of two 64 bit registers.
				980	NumOps *= 2;
				981
				982	return NumOps;
				983	}
				984
Jonas Paulsson	79f2441	2018-11-02 17:15:36 +0000	[diff] [blame]	985	// The generic implementation of getInterleavedMemoryOpCost() is based on
				986	// adding costs of the memory operations plus all the extracts and inserts
				987	// needed for using / defining the vector operands. The SystemZ version does
				988	// roughly the same but bases the computations on vector permutations
				989	// instead.
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	990	int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
				991	unsigned Factor,
				992	ArrayRef<unsigned> Indices,
				993	unsigned Alignment,
Dorit Nuzman	38bbf81	2018-10-14 08:50:06 +0000	[diff] [blame]	994	unsigned AddressSpace,
Dorit Nuzman	34da6dd	2018-10-31 09:57:56 +0000	[diff] [blame]	995	bool UseMaskForCond,
				996	bool UseMaskForGaps) {
				997	if (UseMaskForCond \|\| UseMaskForGaps)
Dorit Nuzman	38bbf81	2018-10-14 08:50:06 +0000	[diff] [blame]	998	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Dorit Nuzman	34da6dd	2018-10-31 09:57:56 +0000	[diff] [blame]	999	Alignment, AddressSpace,
				1000	UseMaskForCond, UseMaskForGaps);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	1001	assert(isa<VectorType>(VecTy) &&
				1002	"Expect a vector type for interleaved memory op");
				1003
Jonas Paulsson	79f2441	2018-11-02 17:15:36 +0000	[diff] [blame]	1004	// Return the ceiling of dividing A by B.
				1005	auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	1006
Jonas Paulsson	79f2441	2018-11-02 17:15:36 +0000	[diff] [blame]	1007	unsigned NumElts = VecTy->getVectorNumElements();
				1008	assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
				1009	unsigned VF = NumElts / Factor;
				1010	unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
				1011	unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
				1012	unsigned NumPermutes = 0;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	1013
Jonas Paulsson	79f2441	2018-11-02 17:15:36 +0000	[diff] [blame]	1014	if (Opcode == Instruction::Load) {
				1015	// Loading interleave groups may have gaps, which may mean fewer
				1016	// loads. Find out how many vectors will be loaded in total, and in how
				1017	// many of them each value will be in.
				1018	BitVector UsedInsts(NumVectorMemOps, false);
				1019	std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
				1020	for (unsigned Index : Indices)
				1021	for (unsigned Elt = 0; Elt < VF; ++Elt) {
				1022	unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
				1023	UsedInsts.set(Vec);
				1024	ValueVecs[Index].set(Vec);
				1025	}
				1026	NumVectorMemOps = UsedInsts.count();
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	1027
Jonas Paulsson	79f2441	2018-11-02 17:15:36 +0000	[diff] [blame]	1028	for (unsigned Index : Indices) {
				1029	// Estimate that each loaded source vector containing this Index
				1030	// requires one operation, except that vperm can handle two input
				1031	// registers first time for each dst vector.
				1032	unsigned NumSrcVecs = ValueVecs[Index].count();
				1033	unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
				1034	assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
				1035	NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
				1036	}
				1037	} else {
				1038	// Estimate the permutes for each stored vector as the smaller of the
				1039	// number of elements and the number of source vectors. Subtract one per
				1040	// dst vector for vperm (S.A.).
				1041	unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
				1042	unsigned NumDstVecs = NumVectorMemOps;
				1043	assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
				1044	NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
				1045	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	1046
				1047	// Cost of load/store operations and the permutations needed.
Jonas Paulsson	79f2441	2018-11-02 17:15:36 +0000	[diff] [blame]	1048	return NumVectorMemOps + NumPermutes;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	1049	}
Jonas Paulsson	96782c2	2018-11-22 07:17:29 +0000	[diff] [blame]	1050
				1051	static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
				1052	if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
				1053	return getNumVectorRegs(RetTy); // VPERM
				1054	return -1;
				1055	}
				1056
				1057	int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
				1058	ArrayRef<Value *> Args,
				1059	FastMathFlags FMF, unsigned VF) {
				1060	int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
				1061	if (Cost != -1)
				1062	return Cost;
				1063	return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
				1064	}
				1065
				1066	int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
				1067	ArrayRef<Type *> Tys,
				1068	FastMathFlags FMF,
				1069	unsigned ScalarizationCostPassed) {
				1070	int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
				1071	if (Cost != -1)
				1072	return Cost;
				1073	return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
				1074	FMF, ScalarizationCostPassed);
				1075	}