Blame - llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp - toolchain/llvm-project

blob: 94db56e3738f1ad22bf9d47fba6df40fb2e4973c [file] [log] [blame]

Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	1	//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// This file implements a TargetTransformInfo analysis pass specific to the
				11	// SystemZ target machine. It uses the target's detailed information to provide
				12	// more precise answers to certain TTI queries, while letting the target
				13	// independent and default TTI implementations handle the rest.
				14	//
				15	//===----------------------------------------------------------------------===//
				16
				17	#include "SystemZTargetTransformInfo.h"
				18	#include "llvm/Analysis/TargetTransformInfo.h"
				19	#include "llvm/CodeGen/BasicTTIImpl.h"
David Blaikie	b3bde2e	2017-11-17 01:07:10 +0000	[diff] [blame]	20	#include "llvm/CodeGen/CostTable.h"
				21	#include "llvm/CodeGen/TargetLowering.h"
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	22	#include "llvm/IR/IntrinsicInst.h"
				23	#include "llvm/Support/Debug.h"
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	24	using namespace llvm;
				25
				26	#define DEBUG_TYPE "systemztti"
				27
				28	//===----------------------------------------------------------------------===//
				29	//
				30	// SystemZ cost model.
				31	//
				32	//===----------------------------------------------------------------------===//
				33
Chandler Carruth	93205eb	2015-08-05 18:08:10 +0000	[diff] [blame]	34	int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	35	assert(Ty->isIntegerTy());
				36
				37	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				38	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				39	// here, so that constant hoisting will ignore this constant.
				40	if (BitSize == 0)
				41	return TTI::TCC_Free;
				42	// No cost model for operations on integers larger than 64 bit implemented yet.
				43	if (BitSize > 64)
				44	return TTI::TCC_Free;
				45
				46	if (Imm == 0)
				47	return TTI::TCC_Free;
				48
				49	if (Imm.getBitWidth() <= 64) {
				50	// Constants loaded via lgfi.
				51	if (isInt<32>(Imm.getSExtValue()))
				52	return TTI::TCC_Basic;
				53	// Constants loaded via llilf.
				54	if (isUInt<32>(Imm.getZExtValue()))
				55	return TTI::TCC_Basic;
				56	// Constants loaded via llihf:
				57	if ((Imm.getZExtValue() & 0xffffffff) == 0)
				58	return TTI::TCC_Basic;
				59
				60	return 2 * TTI::TCC_Basic;
				61	}
				62
				63	return 4 * TTI::TCC_Basic;
				64	}
				65
Chandler Carruth	93205eb	2015-08-05 18:08:10 +0000	[diff] [blame]	66	int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
				67	const APInt &Imm, Type *Ty) {
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	68	assert(Ty->isIntegerTy());
				69
				70	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				71	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				72	// here, so that constant hoisting will ignore this constant.
				73	if (BitSize == 0)
				74	return TTI::TCC_Free;
				75	// No cost model for operations on integers larger than 64 bit implemented yet.
				76	if (BitSize > 64)
				77	return TTI::TCC_Free;
				78
				79	switch (Opcode) {
				80	default:
				81	return TTI::TCC_Free;
				82	case Instruction::GetElementPtr:
				83	// Always hoist the base address of a GetElementPtr. This prevents the
				84	// creation of new constants for every base constant that gets constant
				85	// folded with the offset.
				86	if (Idx == 0)
				87	return 2 * TTI::TCC_Basic;
				88	return TTI::TCC_Free;
				89	case Instruction::Store:
				90	if (Idx == 0 && Imm.getBitWidth() <= 64) {
				91	// Any 8-bit immediate store can by implemented via mvi.
				92	if (BitSize == 8)
				93	return TTI::TCC_Free;
				94	// 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
				95	if (isInt<16>(Imm.getSExtValue()))
				96	return TTI::TCC_Free;
				97	}
				98	break;
				99	case Instruction::ICmp:
				100	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				101	// Comparisons against signed 32-bit immediates implemented via cgfi.
				102	if (isInt<32>(Imm.getSExtValue()))
				103	return TTI::TCC_Free;
				104	// Comparisons against unsigned 32-bit immediates implemented via clgfi.
				105	if (isUInt<32>(Imm.getZExtValue()))
				106	return TTI::TCC_Free;
				107	}
				108	break;
				109	case Instruction::Add:
				110	case Instruction::Sub:
				111	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				112	// We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
				113	if (isUInt<32>(Imm.getZExtValue()))
				114	return TTI::TCC_Free;
				115	// Or their negation, by swapping addition vs. subtraction.
				116	if (isUInt<32>(-Imm.getSExtValue()))
				117	return TTI::TCC_Free;
				118	}
				119	break;
				120	case Instruction::Mul:
				121	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				122	// We use msgfi to multiply by 32-bit signed immediates.
				123	if (isInt<32>(Imm.getSExtValue()))
				124	return TTI::TCC_Free;
				125	}
				126	break;
				127	case Instruction::Or:
				128	case Instruction::Xor:
				129	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				130	// Masks supported by oilf/xilf.
				131	if (isUInt<32>(Imm.getZExtValue()))
				132	return TTI::TCC_Free;
				133	// Masks supported by oihf/xihf.
				134	if ((Imm.getZExtValue() & 0xffffffff) == 0)
				135	return TTI::TCC_Free;
				136	}
				137	break;
				138	case Instruction::And:
				139	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				140	// Any 32-bit AND operation can by implemented via nilf.
				141	if (BitSize <= 32)
				142	return TTI::TCC_Free;
				143	// 64-bit masks supported by nilf.
				144	if (isUInt<32>(~Imm.getZExtValue()))
				145	return TTI::TCC_Free;
				146	// 64-bit masks supported by nilh.
				147	if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
				148	return TTI::TCC_Free;
				149	// Some 64-bit AND operations can be implemented via risbg.
				150	const SystemZInstrInfo *TII = ST->getInstrInfo();
				151	unsigned Start, End;
				152	if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
				153	return TTI::TCC_Free;
				154	}
				155	break;
				156	case Instruction::Shl:
				157	case Instruction::LShr:
				158	case Instruction::AShr:
				159	// Always return TCC_Free for the shift value of a shift instruction.
				160	if (Idx == 1)
				161	return TTI::TCC_Free;
				162	break;
				163	case Instruction::UDiv:
				164	case Instruction::SDiv:
				165	case Instruction::URem:
				166	case Instruction::SRem:
				167	case Instruction::Trunc:
				168	case Instruction::ZExt:
				169	case Instruction::SExt:
				170	case Instruction::IntToPtr:
				171	case Instruction::PtrToInt:
				172	case Instruction::BitCast:
				173	case Instruction::PHI:
				174	case Instruction::Call:
				175	case Instruction::Select:
				176	case Instruction::Ret:
				177	case Instruction::Load:
				178	break;
				179	}
				180
				181	return SystemZTTIImpl::getIntImmCost(Imm, Ty);
				182	}
				183
Chandler Carruth	93205eb	2015-08-05 18:08:10 +0000	[diff] [blame]	184	int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
				185	const APInt &Imm, Type *Ty) {
Ulrich Weigand	1f6666a	2015-03-31 12:52:27 +0000	[diff] [blame]	186	assert(Ty->isIntegerTy());
				187
				188	unsigned BitSize = Ty->getPrimitiveSizeInBits();
				189	// There is no cost model for constants with a bit size of 0. Return TCC_Free
				190	// here, so that constant hoisting will ignore this constant.
				191	if (BitSize == 0)
				192	return TTI::TCC_Free;
				193	// No cost model for operations on integers larger than 64 bit implemented yet.
				194	if (BitSize > 64)
				195	return TTI::TCC_Free;
				196
				197	switch (IID) {
				198	default:
				199	return TTI::TCC_Free;
				200	case Intrinsic::sadd_with_overflow:
				201	case Intrinsic::uadd_with_overflow:
				202	case Intrinsic::ssub_with_overflow:
				203	case Intrinsic::usub_with_overflow:
				204	// These get expanded to include a normal addition/subtraction.
				205	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				206	if (isUInt<32>(Imm.getZExtValue()))
				207	return TTI::TCC_Free;
				208	if (isUInt<32>(-Imm.getSExtValue()))
				209	return TTI::TCC_Free;
				210	}
				211	break;
				212	case Intrinsic::smul_with_overflow:
				213	case Intrinsic::umul_with_overflow:
				214	// These get expanded to include a normal multiplication.
				215	if (Idx == 1 && Imm.getBitWidth() <= 64) {
				216	if (isInt<32>(Imm.getSExtValue()))
				217	return TTI::TCC_Free;
				218	}
				219	break;
				220	case Intrinsic::experimental_stackmap:
				221	if ((Idx < 2) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				222	return TTI::TCC_Free;
				223	break;
				224	case Intrinsic::experimental_patchpoint_void:
				225	case Intrinsic::experimental_patchpoint_i64:
				226	if ((Idx < 4) \|\| (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
				227	return TTI::TCC_Free;
				228	break;
				229	}
				230	return SystemZTTIImpl::getIntImmCost(Imm, Ty);
				231	}
Ulrich Weigand	b401218	2015-03-31 12:56:33 +0000	[diff] [blame]	232
				233	TargetTransformInfo::PopcntSupportKind
				234	SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
				235	assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
				236	if (ST->hasPopulationCount() && TyWidth <= 64)
				237	return TTI::PSK_FastHardware;
				238	return TTI::PSK_Software;
				239	}
				240
Geoff Berry	66d9bdb	2017-06-28 15:53:17 +0000	[diff] [blame]	241	void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
Jonas Paulsson	58c5a7f	2016-09-28 09:41:38 +0000	[diff] [blame]	242	TTI::UnrollingPreferences &UP) {
				243	// Find out if L contains a call, what the machine instruction count
				244	// estimate is, and how many stores there are.
				245	bool HasCall = false;
				246	unsigned NumStores = 0;
				247	for (auto &BB : L->blocks())
				248	for (auto &I : *BB) {
				249	if (isa<CallInst>(&I) \|\| isa<InvokeInst>(&I)) {
				250	ImmutableCallSite CS(&I);
				251	if (const Function *F = CS.getCalledFunction()) {
				252	if (isLoweredToCall(F))
				253	HasCall = true;
				254	if (F->getIntrinsicID() == Intrinsic::memcpy \|\|
				255	F->getIntrinsicID() == Intrinsic::memset)
				256	NumStores++;
				257	} else { // indirect call.
				258	HasCall = true;
				259	}
				260	}
				261	if (isa<StoreInst>(&I)) {
Jonas Paulsson	58c5a7f	2016-09-28 09:41:38 +0000	[diff] [blame]	262	Type *MemAccessTy = I.getOperand(0)->getType();
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	263	NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
Jonas Paulsson	58c5a7f	2016-09-28 09:41:38 +0000	[diff] [blame]	264	}
				265	}
				266
				267	// The z13 processor will run out of store tags if too many stores
				268	// are fed into it too quickly. Therefore make sure there are not
				269	// too many stores in the resulting unrolled loop.
				270	unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
				271
				272	if (HasCall) {
				273	// Only allow full unrolling if loop has any calls.
				274	UP.FullUnrollMaxCount = Max;
				275	UP.MaxCount = 1;
				276	return;
				277	}
				278
				279	UP.MaxCount = Max;
				280	if (UP.MaxCount <= 1)
				281	return;
				282
				283	// Allow partial and runtime trip count unrolling.
				284	UP.Partial = UP.Runtime = true;
				285
				286	UP.PartialThreshold = 75;
				287	UP.DefaultUnrollRuntimeCount = 4;
				288
				289	// Allow expensive instructions in the pre-header of the loop.
				290	UP.AllowExpensiveTripCount = true;
				291
				292	UP.Force = true;
				293	}
				294
Jonas Paulsson	024e319	2017-07-21 11:59:37 +0000	[diff] [blame]	295
				296	bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
				297	TargetTransformInfo::LSRCost &C2) {
				298	// SystemZ specific: check instruction count (first), and don't care about
				299	// ImmCost, since offsets are checked explicitly.
				300	return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
				301	C1.NumIVMuls, C1.NumBaseAdds,
				302	C1.ScaleCost, C1.SetupCost) <
				303	std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
				304	C2.NumIVMuls, C2.NumBaseAdds,
				305	C2.ScaleCost, C2.SetupCost);
				306	}
				307
Ulrich Weigand	ce4c109	2015-05-05 19:25:42 +0000	[diff] [blame]	308	unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
				309	if (!Vector)
				310	// Discount the stack pointer. Also leave out %r0, since it can't
				311	// be used in an address.
				312	return 14;
				313	if (ST->hasVector())
				314	return 32;
				315	return 0;
				316	}
				317
Daniel Neilson	c0112ae	2017-06-12 14:22:21 +0000	[diff] [blame]	318	unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
Ulrich Weigand	ce4c109	2015-05-05 19:25:42 +0000	[diff] [blame]	319	if (!Vector)
				320	return 64;
				321	if (ST->hasVector())
				322	return 128;
				323	return 0;
				324	}
				325
Jonas Paulsson	e54cc1a	2017-11-06 13:10:31 +0000	[diff] [blame]	326	bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
				327	EVT VT = TLI->getValueType(DL, DataType);
				328	return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
				329	}
				330
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	331	// Return the bit size for the scalar type or vector element
				332	// type. getScalarSizeInBits() returns 0 for a pointer type.
				333	static unsigned getScalarSizeInBits(Type *Ty) {
				334	unsigned Size =
				335	(Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
				336	assert(Size > 0 && "Element must have non-zero size.");
				337	return Size;
				338	}
				339
				340	// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
				341	// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
				342	// 3.
				343	static unsigned getNumVectorRegs(Type *Ty) {
				344	assert(Ty->isVectorTy() && "Expected vector type");
				345	unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
				346	assert(WideBits > 0 && "Could not compute size of vector");
				347	return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
				348	}
				349
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	350	int SystemZTTIImpl::getArithmeticInstrCost(
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	351	unsigned Opcode, Type *Ty,
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	352	TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
				353	TTI::OperandValueProperties Opd1PropInfo,
				354	TTI::OperandValueProperties Opd2PropInfo,
				355	ArrayRef<const Value *> Args) {
				356
				357	// TODO: return a good value for BB-VECTORIZER that includes the
				358	// immediate loads, which we do not want to count for the loop
				359	// vectorizer, since they are hopefully hoisted out of the loop. This
				360	// would require a new parameter 'InLoop', but not sure if constant
				361	// args are common enough to motivate this.
				362
				363	unsigned ScalarBits = Ty->getScalarSizeInBits();
				364
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	365	// There are thre cases of division and remainder: Dividing with a register
				366	// needs a divide instruction. A divisor which is a power of two constant
				367	// can be implemented with a sequence of shifts. Any other constant needs a
				368	// multiply and shifts.
				369	const unsigned DivInstrCost = 20;
				370	const unsigned DivMulSeqCost = 10;
				371	const unsigned SDivPow2Cost = 4;
				372
				373	bool SignedDivRem =
				374	Opcode == Instruction::SDiv \|\| Opcode == Instruction::SRem;
				375	bool UnsignedDivRem =
				376	Opcode == Instruction::UDiv \|\| Opcode == Instruction::URem;
				377
				378	// Check for a constant divisor.
				379	bool DivRemConst = false;
				380	bool DivRemConstPow2 = false;
				381	if ((SignedDivRem \|\| UnsignedDivRem) && Args.size() == 2) {
Jonas Paulsson	8722ade	2017-05-17 12:46:26 +0000	[diff] [blame]	382	if (const Constant *C = dyn_cast<Constant>(Args[1])) {
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	383	const ConstantInt *CVal =
				384	(C->getType()->isVectorTy()
				385	? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
				386	: dyn_cast<const ConstantInt>(C));
				387	if (CVal != nullptr &&
				388	(CVal->getValue().isPowerOf2() \|\| (-CVal->getValue()).isPowerOf2()))
				389	DivRemConstPow2 = true;
Jonas Paulsson	8722ade	2017-05-17 12:46:26 +0000	[diff] [blame]	390	else
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	391	DivRemConst = true;
Jonas Paulsson	8722ade	2017-05-17 12:46:26 +0000	[diff] [blame]	392	}
				393	}
				394
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	395	if (Ty->isVectorTy()) {
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	396	assert(ST->hasVector() &&
				397	"getArithmeticInstrCost() called with vector type.");
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	398	unsigned VF = Ty->getVectorNumElements();
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	399	unsigned NumVectors = getNumVectorRegs(Ty);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	400
				401	// These vector operations are custom handled, but are still supported
				402	// with one instruction per vector, regardless of element size.
				403	if (Opcode == Instruction::Shl \|\| Opcode == Instruction::LShr \|\|
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	404	Opcode == Instruction::AShr) {
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	405	return NumVectors;
				406	}
				407
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	408	if (DivRemConstPow2)
				409	return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
				410	if (DivRemConst)
				411	return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
				412	if ((SignedDivRem \|\| UnsignedDivRem) && VF > 4)
				413	// Temporary hack: disable high vectorization factors with integer
				414	// division/remainder, which will get scalarized and handled with
				415	// GR128 registers. The mischeduler is not clever enough to avoid
				416	// spilling yet.
Jonas Paulsson	bf66f38	2018-10-10 09:30:29 +0000	[diff] [blame]	417	return 1000;
				418
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	419	// These FP operations are supported with a single vector instruction for
				420	// double (base implementation assumes float generally costs 2). For
				421	// FP128, the scalar cost is 1, and there is no overhead since the values
				422	// are already in scalar registers.
				423	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
				424	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv) {
				425	switch (ScalarBits) {
				426	case 32: {
Ulrich Weigand	33435c4	2017-07-17 17:42:48 +0000	[diff] [blame]	427	// The vector enhancements facility 1 provides v4f32 instructions.
				428	if (ST->hasVectorEnhancements1())
				429	return NumVectors;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	430	// Return the cost of multiple scalar invocation plus the cost of
				431	// inserting and extracting the values.
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	432	unsigned ScalarCost =
				433	getArithmeticInstrCost(Opcode, Ty->getScalarType());
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	434	unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
				435	// FIXME: VF 2 for these FP operations are currently just as
				436	// expensive as for VF 4.
				437	if (VF == 2)
				438	Cost *= 2;
				439	return Cost;
				440	}
				441	case 64:
				442	case 128:
				443	return NumVectors;
				444	default:
				445	break;
				446	}
				447	}
				448
				449	// There is no native support for FRem.
				450	if (Opcode == Instruction::FRem) {
				451	unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
				452	// FIXME: VF 2 for float is currently just as expensive as for VF 4.
				453	if (VF == 2 && ScalarBits == 32)
				454	Cost *= 2;
				455	return Cost;
				456	}
				457	}
				458	else { // Scalar:
				459	// These FP operations are supported with a dedicated instruction for
				460	// float, double and fp128 (base implementation assumes float generally
				461	// costs 2).
				462	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
				463	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv)
				464	return 1;
				465
				466	// There is no native support for FRem.
				467	if (Opcode == Instruction::FRem)
				468	return LIBCALL_COST;
				469
				470	if (Opcode == Instruction::LShr \|\| Opcode == Instruction::AShr)
				471	return (ScalarBits >= 32 ? 1 : 2 /ext/);
				472
				473	// Or requires one instruction, although it has custom handling for i64.
				474	if (Opcode == Instruction::Or)
				475	return 1;
				476
Jonas Paulsson	77df2f2	2018-09-14 06:46:55 +0000	[diff] [blame]	477	if (Opcode == Instruction::Xor && ScalarBits == 1) {
				478	if (ST->hasLoadStoreOnCond2())
				479	return 5; // 2 * (li 0; loc 1); xor
				480	return 7; // 2 * ipm sequences ; xor ; shift ; compare
				481	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	482
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	483	if (DivRemConstPow2)
				484	return (SignedDivRem ? SDivPow2Cost : 1);
				485	if (DivRemConst)
				486	return DivMulSeqCost;
				487	if (SignedDivRem)
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	488	// sext of op(s) for narrow types
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	489	return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0));
				490	if (UnsignedDivRem)
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	491	// Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
Jonas Paulsson	4645711	2018-10-25 21:47:22 +0000	[diff] [blame]	492	return DivInstrCost + (ScalarBits < 32 ? 3 : 1);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	493	}
				494
				495	// Fallback to the default implementation.
				496	return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
				497	Opd1PropInfo, Opd2PropInfo, Args);
				498	}
				499
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	500	int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
				501	Type *SubTp) {
				502	assert (Tp->isVectorTy());
				503	assert (ST->hasVector() && "getShuffleCost() called.");
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	504	unsigned NumVectors = getNumVectorRegs(Tp);
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	505
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	506	// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
				507
				508	// FP128 values are always in scalar registers, so there is no work
				509	// involved with a shuffle, except for broadcast. In that case register
				510	// moves are done with a single instruction per element.
				511	if (Tp->getScalarType()->isFP128Ty())
				512	return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
				513
				514	switch (Kind) {
				515	case TargetTransformInfo::SK_ExtractSubvector:
				516	// ExtractSubvector Index indicates start offset.
				517
				518	// Extracting a subvector from first index is a noop.
				519	return (Index == 0 ? 0 : NumVectors);
				520
				521	case TargetTransformInfo::SK_Broadcast:
				522	// Loop vectorizer calls here to figure out the extra cost of
				523	// broadcasting a loaded value to all elements of a vector. Since vlrep
				524	// loads and replicates with a single instruction, adjust the returned
				525	// value.
				526	return NumVectors - 1;
				527
				528	default:
				529
				530	// SystemZ supports single instruction permutation / replication.
				531	return NumVectors;
				532	}
				533
				534	return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
				535	}
				536
				537	// Return the log2 difference of the element sizes of the two vector types.
				538	static unsigned getElSizeLog2Diff(Type Ty0, Type Ty1) {
				539	unsigned Bits0 = Ty0->getScalarSizeInBits();
				540	unsigned Bits1 = Ty1->getScalarSizeInBits();
				541
				542	if (Bits1 > Bits0)
				543	return (Log2_32(Bits1) - Log2_32(Bits0));
				544
				545	return (Log2_32(Bits0) - Log2_32(Bits1));
				546	}
				547
				548	// Return the number of instructions needed to truncate SrcTy to DstTy.
				549	unsigned SystemZTTIImpl::
				550	getVectorTruncCost(Type SrcTy, Type DstTy) {
				551	assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
				552	assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
				553	"Packing must reduce size of vector type.");
				554	assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
				555	"Packing should not change number of elements.");
				556
				557	// TODO: Since fp32 is expanded, the extract cost should always be 0.
				558
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	559	unsigned NumParts = getNumVectorRegs(SrcTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	560	if (NumParts <= 2)
				561	// Up to 2 vector registers can be truncated efficiently with pack or
				562	// permute. The latter requires an immediate mask to be loaded, which
				563	// typically gets hoisted out of a loop. TODO: return a good value for
				564	// BB-VECTORIZER that includes the immediate loads, which we do not want
				565	// to count for the loop vectorizer.
				566	return 1;
				567
				568	unsigned Cost = 0;
				569	unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
				570	unsigned VF = SrcTy->getVectorNumElements();
				571	for (unsigned P = 0; P < Log2Diff; ++P) {
				572	if (NumParts > 1)
				573	NumParts /= 2;
				574	Cost += NumParts;
				575	}
				576
				577	// Currently, a general mix of permutes and pack instructions is output by
				578	// isel, which follow the cost computation above except for this case which
				579	// is one instruction less:
				580	if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
				581	DstTy->getScalarSizeInBits() == 8)
				582	Cost--;
				583
				584	return Cost;
				585	}
				586
				587	// Return the cost of converting a vector bitmask produced by a compare
				588	// (SrcTy), to the type of the select or extend instruction (DstTy).
				589	unsigned SystemZTTIImpl::
				590	getVectorBitmaskConversionCost(Type SrcTy, Type DstTy) {
				591	assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
				592	"Should only be called with vector types.");
				593
				594	unsigned PackCost = 0;
				595	unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
				596	unsigned DstScalarBits = DstTy->getScalarSizeInBits();
				597	unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
				598	if (SrcScalarBits > DstScalarBits)
				599	// The bitmask will be truncated.
				600	PackCost = getVectorTruncCost(SrcTy, DstTy);
				601	else if (SrcScalarBits < DstScalarBits) {
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	602	unsigned DstNumParts = getNumVectorRegs(DstTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	603	// Each vector select needs its part of the bitmask unpacked.
				604	PackCost = Log2Diff * DstNumParts;
				605	// Extra cost for moving part of mask before unpacking.
				606	PackCost += DstNumParts - 1;
				607	}
				608
				609	return PackCost;
				610	}
				611
				612	// Return the type of the compared operands. This is needed to compute the
				613	// cost for a Select / ZExt or SExt instruction.
				614	static Type getCmpOpsType(const Instruction I, unsigned VF = 1) {
				615	Type *OpTy = nullptr;
				616	if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
				617	OpTy = CI->getOperand(0)->getType();
				618	else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
Jonas Paulsson	f40eac5	2017-05-03 13:33:45 +0000	[diff] [blame]	619	if (LogicI->getNumOperands() == 2)
				620	if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
				621	if (isa<CmpInst>(LogicI->getOperand(1)))
				622	OpTy = CI0->getOperand(0)->getType();
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	623
				624	if (OpTy != nullptr) {
				625	if (VF == 1) {
				626	assert (!OpTy->isVectorTy() && "Expected scalar type");
				627	return OpTy;
				628	}
				629	// Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
				630	// be either scalar or already vectorized with a same or lesser VF.
				631	Type *ElTy = OpTy->getScalarType();
				632	return VectorType::get(ElTy, VF);
				633	}
				634
				635	return nullptr;
				636	}
				637
				638	int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type Dst, Type Src,
				639	const Instruction *I) {
				640	unsigned DstScalarBits = Dst->getScalarSizeInBits();
				641	unsigned SrcScalarBits = Src->getScalarSizeInBits();
				642
				643	if (Src->isVectorTy()) {
				644	assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
				645	assert (Dst->isVectorTy());
				646	unsigned VF = Src->getVectorNumElements();
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	647	unsigned NumDstVectors = getNumVectorRegs(Dst);
				648	unsigned NumSrcVectors = getNumVectorRegs(Src);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	649
				650	if (Opcode == Instruction::Trunc) {
				651	if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
				652	return 0; // Check for NOOP conversions.
				653	return getVectorTruncCost(Src, Dst);
				654	}
				655
				656	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt) {
				657	if (SrcScalarBits >= 8) {
				658	// ZExt/SExt will be handled with one unpack per doubling of width.
				659	unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
				660
				661	// For types that spans multiple vector registers, some additional
				662	// instructions are used to setup the unpacking.
				663	unsigned NumSrcVectorOps =
				664	(NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
				665	: (NumDstVectors / 2));
				666
				667	return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
				668	}
				669	else if (SrcScalarBits == 1) {
				670	// This should be extension of a compare i1 result.
				671	// If we know what the widths of the compared operands, get the
				672	// cost of converting it to Dst. Otherwise assume same widths.
				673	unsigned Cost = 0;
				674	Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
				675	if (CmpOpTy != nullptr)
				676	Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
				677	if (Opcode == Instruction::ZExt)
				678	// One 'vn' per dst vector with an immediate mask.
				679	Cost += NumDstVectors;
				680	return Cost;
				681	}
				682	}
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	683
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	684	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP \|\|
				685	Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI) {
				686	// TODO: Fix base implementation which could simplify things a bit here
				687	// (seems to miss on differentiating on scalar/vector types).
				688
				689	// Only 64 bit vector conversions are natively supported.
				690	if (SrcScalarBits == 64 && DstScalarBits == 64)
				691	return NumDstVectors;
				692
				693	// Return the cost of multiple scalar invocation plus the cost of
				694	// inserting and extracting the values. Base implementation does not
				695	// realize float->int gets scalarized.
				696	unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
				697	Src->getScalarType());
				698	unsigned TotCost = VF * ScalarCost;
				699	bool NeedsInserts = true, NeedsExtracts = true;
				700	// FP128 registers do not get inserted or extracted.
				701	if (DstScalarBits == 128 &&
				702	(Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP))
				703	NeedsInserts = false;
				704	if (SrcScalarBits == 128 &&
				705	(Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI))
				706	NeedsExtracts = false;
				707
				708	TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
				709
				710	// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
				711	if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
				712	TotCost *= 2;
				713
				714	return TotCost;
				715	}
				716
				717	if (Opcode == Instruction::FPTrunc) {
				718	if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
				719	return VF /ldxbr/lexbr/ + getScalarizationOverhead(Dst, true, false);
				720	else // double -> float
				721	return VF / 2 /vledb/ + std::max(1U, VF / 4 /vperm/);
				722	}
				723
				724	if (Opcode == Instruction::FPExt) {
				725	if (SrcScalarBits == 32 && DstScalarBits == 64) {
				726	// float -> double is very rare and currently unoptimized. Instead of
				727	// using vldeb, which can do two at a time, all conversions are
				728	// scalarized.
				729	return VF * 2;
				730	}
				731	// -> fp128. VF * lxdb/lxeb + extraction of elements.
				732	return VF + getScalarizationOverhead(Src, false, true);
				733	}
				734	}
				735	else { // Scalar
				736	assert (!Dst->isVectorTy());
				737
				738	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP)
				739	return (SrcScalarBits >= 32 ? 1 : 2 /i8/i16 extend/);
Fangrui Song	f78650a	2018-07-30 19:41:25 +0000	[diff] [blame]	740
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	741	if ((Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt) &&
				742	Src->isIntegerTy(1)) {
Jonas Paulsson	77df2f2	2018-09-14 06:46:55 +0000	[diff] [blame]	743	if (ST->hasLoadStoreOnCond2())
				744	return 2; // li 0; loc 1
				745
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	746	// This should be extension of a compare i1 result, which is done with
				747	// ipm and a varying sequence of instructions.
				748	unsigned Cost = 0;
				749	if (Opcode == Instruction::SExt)
				750	Cost = (DstScalarBits < 64 ? 3 : 4);
				751	if (Opcode == Instruction::ZExt)
				752	Cost = 3;
				753	Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
				754	if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
				755	// If operands of an fp-type was compared, this costs +1.
				756	Cost++;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	757	return Cost;
				758	}
				759	}
				760
				761	return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
				762	}
				763
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	764	int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
				765	Type CondTy, const Instruction I) {
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	766	if (ValTy->isVectorTy()) {
				767	assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	768	unsigned VF = ValTy->getVectorNumElements();
				769
				770	// Called with a compare instruction.
				771	if (Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) {
				772	unsigned PredicateExtraCost = 0;
				773	if (I != nullptr) {
				774	// Some predicates cost one or two extra instructions.
Craig Topper	781aa18	2018-05-05 01:57:00 +0000	[diff] [blame]	775	switch (cast<CmpInst>(I)->getPredicate()) {
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	776	case CmpInst::Predicate::ICMP_NE:
				777	case CmpInst::Predicate::ICMP_UGE:
				778	case CmpInst::Predicate::ICMP_ULE:
				779	case CmpInst::Predicate::ICMP_SGE:
				780	case CmpInst::Predicate::ICMP_SLE:
				781	PredicateExtraCost = 1;
				782	break;
				783	case CmpInst::Predicate::FCMP_ONE:
				784	case CmpInst::Predicate::FCMP_ORD:
				785	case CmpInst::Predicate::FCMP_UEQ:
				786	case CmpInst::Predicate::FCMP_UNO:
				787	PredicateExtraCost = 2;
				788	break;
				789	default:
				790	break;
				791	}
				792	}
				793
				794	// Float is handled with 2vmr[lh]f + 2vldeb + vfchdb for each pair of
				795	// floats. FIXME: <2 x float> generates same code as <4 x float>.
				796	unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	797	unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	798
				799	unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
				800	return Cost;
				801	}
				802	else { // Called with a select instruction.
				803	assert (Opcode == Instruction::Select);
				804
				805	// We can figure out the extra cost of packing / unpacking if the
				806	// instruction was passed and the compare instruction is found.
				807	unsigned PackCost = 0;
				808	Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
				809	if (CmpOpTy != nullptr)
				810	PackCost =
				811	getVectorBitmaskConversionCost(CmpOpTy, ValTy);
				812
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	813	return getNumVectorRegs(ValTy) /vsel/ + PackCost;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	814	}
				815	}
				816	else { // Scalar
				817	switch (Opcode) {
				818	case Instruction::ICmp: {
				819	unsigned Cost = 1;
				820	if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
				821	Cost += 2; // extend both operands
				822	return Cost;
				823	}
				824	case Instruction::Select:
				825	if (ValTy->isFloatingPointTy())
Jonas Paulsson	2b280ea	2018-10-25 22:53:27 +0000	[diff] [blame]	826	return 4; // No load on condition for FP - costs a conditional jump.
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	827	return 1; // Load On Condition.
				828	}
				829	}
				830
				831	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
				832	}
				833
				834	int SystemZTTIImpl::
				835	getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
				836	// vlvgp will insert two grs into a vector register, so only count half the
				837	// number of instructions.
Craig Topper	fde4723	2017-07-09 07:04:03 +0000	[diff] [blame]	838	if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	839	return ((Index % 2 == 0) ? 1 : 0);
				840
				841	if (Opcode == Instruction::ExtractElement) {
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	842	int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /+test-under-mask/ : 1);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	843
				844	// Give a slight penalty for moving out of vector pipeline to FXU unit.
Craig Topper	95d2347	2017-07-09 07:04:00 +0000	[diff] [blame]	845	if (Index == 0 && Val->isIntOrIntVectorTy())
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	846	Cost += 1;
				847
				848	return Cost;
				849	}
				850
				851	return BaseT::getVectorInstrCost(Opcode, Val, Index);
				852	}
				853
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	854	// Check if a load may be folded as a memory operand in its user.
				855	bool SystemZTTIImpl::
				856	isFoldableLoad(const LoadInst Ld, const Instruction &FoldedValue) {
				857	if (!Ld->hasOneUse())
				858	return false;
				859	FoldedValue = Ld;
				860	const Instruction UserI = cast<Instruction>(Ld->user_begin());
				861	unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
				862	unsigned TruncBits = 0;
				863	unsigned SExtBits = 0;
				864	unsigned ZExtBits = 0;
				865	if (UserI->hasOneUse()) {
				866	unsigned UserBits = UserI->getType()->getScalarSizeInBits();
				867	if (isa<TruncInst>(UserI))
				868	TruncBits = UserBits;
				869	else if (isa<SExtInst>(UserI))
				870	SExtBits = UserBits;
				871	else if (isa<ZExtInst>(UserI))
				872	ZExtBits = UserBits;
				873	}
				874	if (TruncBits \|\| SExtBits \|\| ZExtBits) {
				875	FoldedValue = UserI;
				876	UserI = cast<Instruction>(*UserI->user_begin());
				877	// Load (single use) -> trunc/extend (single use) -> UserI
				878	}
Jonas Paulsson	af8e036	2018-10-30 13:41:03 +0000	[diff] [blame]	879	if ((UserI->getOpcode() == Instruction::Sub \|\|
				880	UserI->getOpcode() == Instruction::SDiv \|\|
				881	UserI->getOpcode() == Instruction::UDiv) &&
				882	UserI->getOperand(1) != FoldedValue)
				883	return false; // Not commutative, only RHS foldable.
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	884	switch (UserI->getOpcode()) {
				885	case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
				886	case Instruction::Sub:
				887	if (LoadedBits == 32 && ZExtBits == 64)
				888	return true;
				889	LLVM_FALLTHROUGH;
				890	case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
				891	if (LoadedBits == 16 &&
				892	(SExtBits == 32 \|\|
				893	(SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
				894	return true;
				895	LLVM_FALLTHROUGH;
				896	case Instruction::SDiv:// SE: 32->64
				897	if (LoadedBits == 32 && SExtBits == 64)
				898	return true;
				899	LLVM_FALLTHROUGH;
				900	case Instruction::UDiv:
				901	case Instruction::And:
				902	case Instruction::Or:
				903	case Instruction::Xor:
				904	case Instruction::ICmp:
				905	// This also makes sense for float operations, but disabled for now due
				906	// to regressions.
				907	// case Instruction::FCmp:
				908	// case Instruction::FAdd:
				909	// case Instruction::FSub:
				910	// case Instruction::FMul:
				911	// case Instruction::FDiv:
				912
				913	// All possible extensions of memory checked above.
				914	if (SExtBits \|\| ZExtBits)
				915	return false;
				916
				917	unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
				918	return (LoadOrTruncBits == 32 \|\| LoadOrTruncBits == 64);
				919	break;
				920	}
				921	return false;
				922	}
				923
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	924	int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
				925	unsigned Alignment, unsigned AddressSpace,
				926	const Instruction *I) {
				927	assert(!Src->isVoidTy() && "Invalid type");
				928
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	929	if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
				930	// Store the load or its truncated or extended value in FoldedValue.
				931	const Instruction *FoldedValue = nullptr;
				932	if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
				933	const Instruction UserI = cast<Instruction>(FoldedValue->user_begin());
				934	assert (UserI->getNumOperands() == 2 && "Expected a binop.");
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	935
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	936	// UserI can't fold two loads, so in that case return 0 cost only
				937	// half of the time.
				938	for (unsigned i = 0; i < 2; ++i) {
				939	if (UserI->getOperand(i) == FoldedValue)
				940	continue;
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	941
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	942	if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
				943	LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
				944	if (!OtherLoad &&
				945	(isa<TruncInst>(OtherOp) \|\| isa<SExtInst>(OtherOp) \|\|
				946	isa<ZExtInst>(OtherOp)))
				947	OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
				948	if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/dummy/))
				949	return i == 0; // Both operands foldable.
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	950	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	951	}
Jonas Paulsson	b7caa80	2018-10-25 22:28:25 +0000	[diff] [blame]	952
				953	return 0; // Only I is foldable in user.
				954	}
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	955	}
				956
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	957	unsigned NumOps =
				958	(Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	959
				960	if (Src->getScalarSizeInBits() == 128)
				961	// 128 bit scalars are held in a pair of two 64 bit registers.
				962	NumOps *= 2;
				963
				964	return NumOps;
				965	}
				966
				967	int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
				968	unsigned Factor,
				969	ArrayRef<unsigned> Indices,
				970	unsigned Alignment,
Dorit Nuzman	38bbf81	2018-10-14 08:50:06 +0000	[diff] [blame]	971	unsigned AddressSpace,
Dorit Nuzman	34da6dd	2018-10-31 09:57:56 +0000	[diff] [blame^]	972	bool UseMaskForCond,
				973	bool UseMaskForGaps) {
				974	if (UseMaskForCond \|\| UseMaskForGaps)
Dorit Nuzman	38bbf81	2018-10-14 08:50:06 +0000	[diff] [blame]	975	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Dorit Nuzman	34da6dd	2018-10-31 09:57:56 +0000	[diff] [blame^]	976	Alignment, AddressSpace,
				977	UseMaskForCond, UseMaskForGaps);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	978	assert(isa<VectorType>(VecTy) &&
				979	"Expect a vector type for interleaved memory op");
				980
Jonas Paulsson	2c8b337	2018-10-10 07:36:27 +0000	[diff] [blame]	981	int NumWideParts = getNumVectorRegs(VecTy);
Jonas Paulsson	fccc7d6	2017-04-12 11:49:08 +0000	[diff] [blame]	982
				983	// How many source vectors are handled to produce a vectorized operand?
				984	int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
				985	int NumSrcParts =
				986	((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
				987
				988	// A Load group may have gaps.
				989	unsigned NumOperands =
				990	((Opcode == Instruction::Load) ? Indices.size() : Factor);
				991
				992	// Each needed permute takes two vectors as input.
				993	if (NumSrcParts > 1)
				994	NumSrcParts--;
				995	int NumPermutes = NumSrcParts * NumOperands;
				996
				997	// Cost of load/store operations and the permutations needed.
				998	return NumWideParts + NumPermutes;
				999	}