Blame - llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp - toolchain/llvm-project

blob: e7424aebd7f41a4e2d8795b07289669bcf410dd6 [file] [log] [blame]

Duncan P. N. Exon Smith	10be9a8	2014-04-21 17:57:07 +0000	[diff] [blame^]	1	//===- BlockFrequencyInfoImpl.cpp - Block Frequency Info Implementation ---===//
				2	//
				3	// The LLVM Compiler Infrastructure
				4	//
				5	// This file is distributed under the University of Illinois Open Source
				6	// License. See LICENSE.TXT for details.
				7	//
				8	//===----------------------------------------------------------------------===//
				9	//
				10	// Loops should be simplified before this analysis.
				11	//
				12	//===----------------------------------------------------------------------===//
				13
				14	#define DEBUG_TYPE "block-freq"
				15	#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
				16	#include "llvm/ADT/APFloat.h"
				17	#include "llvm/Support/raw_ostream.h"
				18	#include <deque>
				19
				20	using namespace llvm;
				21
				22	//===----------------------------------------------------------------------===//
				23	//
				24	// PositiveFloat implementation.
				25	//
				26	//===----------------------------------------------------------------------===//
					// Out-of-line definitions for the PositiveFloatBase exponent-limit constants.
					// They carry no initializer here, so the values are presumably given in the
					// class declaration (not visible in this file) -- TODO confirm.
					// MaxExponent is passed to std::min in toStringAPFloat(), which binds it by
					// reference and therefore needs a definition.
					// The definitions are compiled out when _MSC_VER is defined.
				27	#ifndef _MSC_VER
				28	const int32_t PositiveFloatBase::MaxExponent;
				29	const int32_t PositiveFloatBase::MinExponent;
				30	#endif
				31
				32	static void appendDigit(std::string &Str, unsigned D) {
				33	assert(D < 10);
				34	Str += '0' + D % 10;
				35	}
				36
				37	static void appendNumber(std::string &Str, uint64_t N) {
				38	while (N) {
				39	appendDigit(Str, N % 10);
				40	N /= 10;
				41	}
				42	}
				43
				44	static bool doesRoundUp(char Digit) {
				45	switch (Digit) {
				46	case '5':
				47	case '6':
				48	case '7':
				49	case '8':
				50	case '9':
				51	return true;
				52	default:
				53	return false;
				54	}
				55	}
				56
				57	static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
				58	assert(E >= PositiveFloatBase::MinExponent);
				59	assert(E <= PositiveFloatBase::MaxExponent);
				60
				61	// Find a new E, but don't let it increase past MaxExponent.
				62	int LeadingZeros = PositiveFloatBase::countLeadingZeros64(D);
				63	int NewE = std::min(PositiveFloatBase::MaxExponent, E + 63 - LeadingZeros);
				64	int Shift = 63 - (NewE - E);
				65	assert(Shift <= LeadingZeros);
				66	assert(Shift == LeadingZeros \|\| NewE == PositiveFloatBase::MaxExponent);
				67	D <<= Shift;
				68	E = NewE;
				69
				70	// Check for a denormal.
				71	unsigned AdjustedE = E + 16383;
				72	if (!(D >> 63)) {
				73	assert(E == PositiveFloatBase::MaxExponent);
				74	AdjustedE = 0;
				75	}
				76
				77	// Build the float and print it.
				78	uint64_t RawBits[2] = {D, AdjustedE};
				79	APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
				80	SmallVector<char, 24> Chars;
				81	Float.toString(Chars, Precision, 0);
				82	return std::string(Chars.begin(), Chars.end());
				83	}
				84
				85	static std::string stripTrailingZeros(const std::string &Float) {
				86	size_t NonZero = Float.find_last_not_of('0');
				87	assert(NonZero != std::string::npos && "no . in floating point string");
				88
				89	if (Float[NonZero] == '.')
				90	++NonZero;
				91
				92	return Float.substr(0, NonZero + 1);
				93	}
				94
				95	std::string PositiveFloatBase::toString(uint64_t D, int16_t E, int Width,
				96	unsigned Precision) {
				97	if (!D)
				98	return "0.0";
				99
				100	// Canonicalize exponent and digits.
				101	uint64_t Above0 = 0;
				102	uint64_t Below0 = 0;
				103	uint64_t Extra = 0;
				104	int ExtraShift = 0;
				105	if (E == 0) {
				106	Above0 = D;
				107	} else if (E > 0) {
				108	if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
				109	D <<= Shift;
				110	E -= Shift;
				111
				112	if (!E)
				113	Above0 = D;
				114	}
				115	} else if (E > -64) {
				116	Above0 = D >> -E;
				117	Below0 = D << (64 + E);
				118	} else if (E > -120) {
				119	Below0 = D >> (-E - 64);
				120	Extra = D << (128 + E);
				121	ExtraShift = -64 - E;
				122	}
				123
				124	// Fall back on APFloat for very small and very large numbers.
				125	if (!Above0 && !Below0)
				126	return toStringAPFloat(D, E, Precision);
				127
				128	// Append the digits before the decimal.
				129	std::string Str;
				130	size_t DigitsOut = 0;
				131	if (Above0) {
				132	appendNumber(Str, Above0);
				133	DigitsOut = Str.size();
				134	} else
				135	appendDigit(Str, 0);
				136	std::reverse(Str.begin(), Str.end());
				137
				138	// Return early if there's nothing after the decimal.
				139	if (!Below0)
				140	return Str + ".0";
				141
				142	// Append the decimal and beyond.
				143	Str += '.';
				144	uint64_t Error = UINT64_C(1) << (64 - Width);
				145
				146	// We need to shift Below0 to the right to make space for calculating
				147	// digits. Save the precision we're losing in Extra.
				148	Extra = (Below0 & 0xf) << 56 \| (Extra >> 8);
				149	Below0 >>= 4;
				150	size_t SinceDot = 0;
				151	size_t AfterDot = Str.size();
				152	do {
				153	if (ExtraShift) {
				154	--ExtraShift;
				155	Error *= 5;
				156	} else
				157	Error *= 10;
				158
				159	Below0 *= 10;
				160	Extra *= 10;
				161	Below0 += (Extra >> 60);
				162	Extra = Extra & (UINT64_MAX >> 4);
				163	appendDigit(Str, Below0 >> 60);
				164	Below0 = Below0 & (UINT64_MAX >> 4);
				165	if (DigitsOut \|\| Str.back() != '0')
				166	++DigitsOut;
				167	++SinceDot;
				168	} while (Error && (Below0 << 4 \| Extra >> 60) >= Error / 2 &&
				169	(!Precision \|\| DigitsOut <= Precision \|\| SinceDot < 2));
				170
				171	// Return early for maximum precision.
				172	if (!Precision \|\| DigitsOut <= Precision)
				173	return stripTrailingZeros(Str);
				174
				175	// Find where to truncate.
				176	size_t Truncate =
				177	std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
				178
				179	// Check if there's anything to truncate.
				180	if (Truncate >= Str.size())
				181	return stripTrailingZeros(Str);
				182
				183	bool Carry = doesRoundUp(Str[Truncate]);
				184	if (!Carry)
				185	return stripTrailingZeros(Str.substr(0, Truncate));
				186
				187	// Round with the first truncated digit.
				188	for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
				189	I != E; ++I) {
				190	if (*I == '.')
				191	continue;
				192	if (*I == '9') {
				193	*I = '0';
				194	continue;
				195	}
				196
				197	++*I;
				198	Carry = false;
				199	break;
				200	}
				201
				202	// Add "1" in front if we still need to carry.
				203	return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
				204	}
				205
				206	raw_ostream &PositiveFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
				207	int Width, unsigned Precision) {
				208	return OS << toString(D, E, Width, Precision);
				209	}
				210
				211	void PositiveFloatBase::dump(uint64_t D, int16_t E, int Width) {
				212	print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
				213	<< "]";
				214	}
				215
				216	static std::pair<uint64_t, int16_t>
				217	getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
				218	if (ShouldRound)
				219	if (!++N)
				220	// Rounding caused an overflow.
				221	return std::make_pair(UINT64_C(1), Shift + 64);
				222	return std::make_pair(N, Shift);
				223	}
				224
				225	std::pair<uint64_t, int16_t> PositiveFloatBase::divide64(uint64_t Dividend,
				226	uint64_t Divisor) {
				227	// Input should be sanitized.
				228	assert(Divisor);
				229	assert(Dividend);
				230
				231	// Minimize size of divisor.
				232	int16_t Shift = 0;
				233	if (int Zeros = countTrailingZeros(Divisor)) {
				234	Shift -= Zeros;
				235	Divisor >>= Zeros;
				236	}
				237
				238	// Check for powers of two.
				239	if (Divisor == 1)
				240	return std::make_pair(Dividend, Shift);
				241
				242	// Maximize size of dividend.
				243	if (int Zeros = countLeadingZeros64(Dividend)) {
				244	Shift -= Zeros;
				245	Dividend <<= Zeros;
				246	}
				247
				248	// Start with the result of a divide.
				249	uint64_t Quotient = Dividend / Divisor;
				250	Dividend %= Divisor;
				251
				252	// Continue building the quotient with long division.
				253	//
				254	// TODO: continue with largers digits.
				255	while (!(Quotient >> 63) && Dividend) {
				256	// Shift Dividend, and check for overflow.
				257	bool IsOverflow = Dividend >> 63;
				258	Dividend <<= 1;
				259	--Shift;
				260
				261	// Divide.
				262	bool DoesDivide = IsOverflow \|\| Divisor <= Dividend;
				263	Quotient = (Quotient << 1) \| uint64_t(DoesDivide);
				264	Dividend -= DoesDivide ? Divisor : 0;
				265	}
				266
				267	// Round.
				268	if (Dividend >= getHalf(Divisor))
				269	if (!++Quotient)
				270	// Rounding caused an overflow in Quotient.
				271	return std::make_pair(UINT64_C(1), Shift + 64);
				272
				273	return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
				274	}
				275
				276	std::pair<uint64_t, int16_t> PositiveFloatBase::multiply64(uint64_t L,
				277	uint64_t R) {
				278	// Separate into two 32-bit digits (U.L).
				279	uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
				280
				281	// Compute cross products.
				282	uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
				283
				284	// Sum into two 64-bit digits.
				285	uint64_t Upper = P1, Lower = P4;
				286	auto addWithCarry = [&](uint64_t N) {
				287	uint64_t NewLower = Lower + (N << 32);
				288	Upper += (N >> 32) + (NewLower < Lower);
				289	Lower = NewLower;
				290	};
				291	addWithCarry(P2);
				292	addWithCarry(P3);
				293
				294	// Check whether the upper digit is empty.
				295	if (!Upper)
				296	return std::make_pair(Lower, 0);
				297
				298	// Shift as little as possible to maximize precision.
				299	unsigned LeadingZeros = countLeadingZeros64(Upper);
				300	int16_t Shift = 64 - LeadingZeros;
				301	if (LeadingZeros)
				302	Upper = Upper << LeadingZeros \| Lower >> Shift;
				303	bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
				304	return getRoundedFloat(Upper, ShouldRound, Shift);
				305	}
				306
				307	//===----------------------------------------------------------------------===//
				308	//
				309	// BlockMass implementation.
				310	//
				311	//===----------------------------------------------------------------------===//
				312	BlockMass &BlockMass::operator*=(const BranchProbability &P) {
				313	uint32_t N = P.getNumerator(), D = P.getDenominator();
				314	assert(D && "divide by 0");
				315	assert(N <= D && "fraction greater than 1");
				316
				317	// Fast path for multiplying by 1.0.
				318	if (!Mass \|\| N == D)
				319	return *this;
				320
				321	// Get as much precision as we can.
				322	int Shift = countLeadingZeros(Mass);
				323	uint64_t ShiftedQuotient = (Mass << Shift) / D;
				324	uint64_t Product = ShiftedQuotient * N >> Shift;
				325
				326	// Now check for what's lost.
				327	uint64_t Left = ShiftedQuotient * (D - N) >> Shift;
				328	uint64_t Lost = Mass - Product - Left;
				329
				330	// TODO: prove this assertion.
				331	assert(Lost <= UINT32_MAX);
				332
				333	// Take the product plus a portion of the spoils.
				334	Mass = Product + Lost * N / D;
				335	return *this;
				336	}
				337
				338	PositiveFloat<uint64_t> BlockMass::toFloat() const {
				339	if (isFull())
				340	return PositiveFloat<uint64_t>(1, 0);
				341	return PositiveFloat<uint64_t>(getMass() + 1, -64);
				342	}
				343
				344	void BlockMass::dump() const { print(dbgs()); }
				345
				346	static char getHexDigit(int N) {
				347	assert(N < 16);
				348	if (N < 10)
				349	return '0' + N;
				350	return 'a' + N - 10;
				351	}
				352	raw_ostream &BlockMass::print(raw_ostream &OS) const {
				353	for (int Digits = 0; Digits < 16; ++Digits)
				354	OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
				355	return OS;
				356	}
				357
				358	//===----------------------------------------------------------------------===//
				359	//
				360	// BlockFrequencyInfoImpl implementation.
				361	//
				362	//===----------------------------------------------------------------------===//
				363	namespace {
				364
				365	typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
				366	typedef BlockFrequencyInfoImplBase::Distribution Distribution;
				367	typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
				368	typedef BlockFrequencyInfoImplBase::Float Float;
				369	typedef BlockFrequencyInfoImplBase::PackagedLoopData PackagedLoopData;
				370	typedef BlockFrequencyInfoImplBase::Weight Weight;
				371	typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
				372
				373	/// \brief Dithering mass distributer.
				374	///
				375	/// This class splits up a single mass into portions by weight, dithering to
				376	/// spread out error. No mass is lost. The dithering precision depends on the
				377	/// precision of the product of \a BlockMass and \a BranchProbability.
				378	///
				379	/// The distribution algorithm follows.
				380	///
				381	/// 1. Initialize by saving the sum of the weights in \a RemWeight and the
				382	/// mass to distribute in \a RemMass.
				383	///
				384	/// 2. For each portion:
				385	///
				386	/// 1. Construct a branch probability, P, as the portion's weight divided
				387	/// by the current value of \a RemWeight.
				388	/// 2. Calculate the portion's mass as \a RemMass times P.
				389	/// 3. Update \a RemWeight and \a RemMass at each portion by subtracting
				390	/// the current portion's weight and mass.
				391	///
				392	/// Mass is distributed in two ways: full distribution and forward
				393	/// distribution. The latter ignores backedges, and uses the parallel fields
				394	/// \a RemForwardWeight and \a RemForwardMass.
				395	struct DitheringDistributer {
				396	uint32_t RemWeight;
				397	uint32_t RemForwardWeight;
				398
				399	BlockMass RemMass;
				400	BlockMass RemForwardMass;
				401
				402	DitheringDistributer(Distribution &Dist, const BlockMass &Mass);
				403
				404	BlockMass takeLocalMass(uint32_t Weight) {
				405	(void)takeMass(Weight);
				406	return takeForwardMass(Weight);
				407	}
				408	BlockMass takeExitMass(uint32_t Weight) {
				409	(void)takeForwardMass(Weight);
				410	return takeMass(Weight);
				411	}
				412	BlockMass takeBackedgeMass(uint32_t Weight) { return takeMass(Weight); }
				413
				414	private:
				415	BlockMass takeForwardMass(uint32_t Weight);
				416	BlockMass takeMass(uint32_t Weight);
				417	};
				418	}
				419
				420	DitheringDistributer::DitheringDistributer(Distribution &Dist,
				421	const BlockMass &Mass) {
				422	Dist.normalize();
				423	RemWeight = Dist.Total;
				424	RemForwardWeight = Dist.ForwardTotal;
				425	RemMass = Mass;
				426	RemForwardMass = Dist.ForwardTotal ? Mass : BlockMass();
				427	}
				428
				429	BlockMass DitheringDistributer::takeForwardMass(uint32_t Weight) {
				430	// Compute the amount of mass to take.
				431	assert(Weight && "invalid weight");
				432	assert(Weight <= RemForwardWeight);
				433	BlockMass Mass = RemForwardMass * BranchProbability(Weight, RemForwardWeight);
				434
				435	// Decrement totals (dither).
				436	RemForwardWeight -= Weight;
				437	RemForwardMass -= Mass;
				438	return Mass;
				439	}
				440	BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
				441	assert(Weight && "invalid weight");
				442	assert(Weight <= RemWeight);
				443	BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
				444
				445	// Decrement totals (dither).
				446	RemWeight -= Weight;
				447	RemMass -= Mass;
				448	return Mass;
				449	}
				450
				451	void Distribution::add(const BlockNode &Node, uint64_t Amount,
				452	Weight::DistType Type) {
				453	assert(Amount && "invalid weight of 0");
				454	uint64_t NewTotal = Total + Amount;
				455
				456	// Check for overflow. It should be impossible to overflow twice.
				457	bool IsOverflow = NewTotal < Total;
				458	assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
				459	DidOverflow \|= IsOverflow;
				460
				461	// Update the total.
				462	Total = NewTotal;
				463
				464	// Save the weight.
				465	Weight W;
				466	W.TargetNode = Node;
				467	W.Amount = Amount;
				468	W.Type = Type;
				469	Weights.push_back(W);
				470
				471	if (Type == Weight::Backedge)
				472	return;
				473
				474	// Update forward total. Don't worry about overflow here, since then Total
				475	// will exceed 32-bits and they'll both be recomputed in normalize().
				476	ForwardTotal += Amount;
				477	}
				478
				479	static void combineWeight(Weight &W, const Weight &OtherW) {
				480	assert(OtherW.TargetNode.isValid());
				481	if (!W.Amount) {
				482	W = OtherW;
				483	return;
				484	}
				485	assert(W.Type == OtherW.Type);
				486	assert(W.TargetNode == OtherW.TargetNode);
				487	assert(W.Amount < W.Amount + OtherW.Amount);
				488	W.Amount += OtherW.Amount;
				489	}
				490	static void combineWeightsBySorting(WeightList &Weights) {
				491	// Sort so edges to the same node are adjacent.
				492	std::sort(Weights.begin(), Weights.end(),
				493	[](const Weight &L,
				494	const Weight &R) { return L.TargetNode < R.TargetNode; });
				495
				496	// Combine adjacent edges.
				497	WeightList::iterator O = Weights.begin();
				498	for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
				499	++O, (I = L)) {
				500	O = I;
				501
				502	// Find the adjacent weights to the same node.
				503	for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
				504	combineWeight(O, L);
				505	}
				506
				507	// Erase extra entries.
				508	Weights.erase(O, Weights.end());
				509	return;
				510	}
				511	static void combineWeightsByHashing(WeightList &Weights) {
				512	// Collect weights into a DenseMap.
				513	typedef DenseMap<BlockNode::IndexType, Weight> HashTable;
				514	HashTable Combined(NextPowerOf2(2 * Weights.size()));
				515	for (const Weight &W : Weights)
				516	combineWeight(Combined[W.TargetNode.Index], W);
				517
				518	// Check whether anything changed.
				519	if (Weights.size() == Combined.size())
				520	return;
				521
				522	// Fill in the new weights.
				523	Weights.clear();
				524	Weights.reserve(Combined.size());
				525	for (const auto &I : Combined)
				526	Weights.push_back(I.second);
				527	}
				528	static void combineWeights(WeightList &Weights) {
				529	// Use a hash table for many successors to keep this linear.
				530	if (Weights.size() > 128) {
				531	combineWeightsByHashing(Weights);
				532	return;
				533	}
				534
				535	combineWeightsBySorting(Weights);
				536	}
				537	static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
				538	assert(Shift >= 0);
				539	assert(Shift < 64);
				540	if (!Shift)
				541	return N;
				542	return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
				543	}
				544	void Distribution::normalize() {
				545	// Early exit for termination nodes.
				546	if (Weights.empty())
				547	return;
				548
				549	// Only bother if there are multiple successors.
				550	if (Weights.size() > 1)
				551	combineWeights(Weights);
				552
				553	// Early exit when combined into a single successor.
				554	if (Weights.size() == 1) {
				555	Total = 1;
				556	ForwardTotal = Weights.front().Type != Weight::Backedge;
				557	Weights.front().Amount = 1;
				558	return;
				559	}
				560
				561	// Determine how much to shift right so that the total fits into 32-bits.
				562	//
				563	// If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1
				564	// for each weight can cause a 32-bit overflow.
				565	int Shift = 0;
				566	if (DidOverflow)
				567	Shift = 33;
				568	else if (Total > UINT32_MAX)
				569	Shift = 33 - countLeadingZeros(Total);
				570
				571	// Early exit if nothing needs to be scaled.
				572	if (!Shift)
				573	return;
				574
				575	// Recompute the total through accumulation (rather than shifting it) so that
				576	// it's accurate after shifting. ForwardTotal is dirty here anyway.
				577	Total = 0;
				578	ForwardTotal = 0;
				579
				580	// Sum the weights to each node and shift right if necessary.
				581	for (Weight &W : Weights) {
				582	// Scale down below UINT32_MAX. Since Shift is larger than necessary, we
				583	// can round here without concern about overflow.
				584	assert(W.TargetNode.isValid());
				585	W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
				586	assert(W.Amount <= UINT32_MAX);
				587
				588	// Update the total.
				589	Total += W.Amount;
				590	if (W.Type == Weight::Backedge)
				591	continue;
				592
				593	// Update the forward total.
				594	ForwardTotal += W.Amount;
				595	}
				596	assert(Total <= UINT32_MAX);
				597	}
				598
				599	void BlockFrequencyInfoImplBase::clear() {
				600	*this = BlockFrequencyInfoImplBase();
				601	}
				602
				603	/// \brief Clear all memory not needed downstream.
				604	///
				605	/// Releases all memory not used downstream. In particular, saves Freqs.
				606	static void cleanup(BlockFrequencyInfoImplBase &BFI) {
				607	std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
				608	BFI.clear();
				609	BFI.Freqs = std::move(SavedFreqs);
				610	}
				611
				612	/// \brief Get a possibly packaged node.
				613	///
				614	/// Get the node currently representing Node, which could be a containing
				615	/// loop.
				616	///
				617	/// This function should only be called when distributing mass. As long as
				618	/// there are no irreducilbe edges to Node, then it will have complexity O(1)
				619	/// in this context.
				620	///
				621	/// In general, the complexity is O(L), where L is the number of loop headers
				622	/// Node has been packaged into. Since this method is called in the context
				623	/// of distributing mass, L will be the number of loop headers an early exit
				624	/// edge jumps out of.
				625	static BlockNode getPackagedNode(const BlockFrequencyInfoImplBase &BFI,
				626	const BlockNode &Node) {
				627	assert(Node.isValid());
				628	if (!BFI.Working[Node.Index].IsPackaged)
				629	return Node;
				630	if (!BFI.Working[Node.Index].ContainingLoop.isValid())
				631	return Node;
				632	return getPackagedNode(BFI, BFI.Working[Node.Index].ContainingLoop);
				633	}
				634
				635	/// \brief Get the appropriate mass for a possible pseudo-node loop package.
				636	///
				637	/// Get appropriate mass for Node. If Node is a loop-header (whose loop has
				638	/// been packaged), returns the mass of its pseudo-node. If it's a node inside
				639	/// a packaged loop, it returns the loop's pseudo-node.
				640	static BlockMass &getPackageMass(BlockFrequencyInfoImplBase &BFI,
				641	const BlockNode &Node) {
				642	assert(Node.isValid());
				643	assert(!BFI.Working[Node.Index].IsPackaged);
				644	if (!BFI.Working[Node.Index].IsAPackage)
				645	return BFI.Working[Node.Index].Mass;
				646
				647	return BFI.getLoopPackage(Node).Mass;
				648	}
				649
				650	void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
				651	const BlockNode &LoopHead,
				652	const BlockNode &Pred,
				653	const BlockNode &Succ,
				654	uint64_t Weight) {
				655	if (!Weight)
				656	Weight = 1;
				657
				658	#ifndef NDEBUG
				659	auto debugSuccessor = [&](const char *Type, const BlockNode &Resolved) {
				660	dbgs() << " =>"
				661	<< " [" << Type << "] weight = " << Weight;
				662	if (Succ != LoopHead)
				663	dbgs() << ", succ = " << getBlockName(Succ);
				664	if (Resolved != Succ)
				665	dbgs() << ", resolved = " << getBlockName(Resolved);
				666	dbgs() << "\n";
				667	};
				668	(void)debugSuccessor;
				669	#endif
				670
				671	if (Succ == LoopHead) {
				672	DEBUG(debugSuccessor("backedge", Succ));
				673	Dist.addBackedge(LoopHead, Weight);
				674	return;
				675	}
				676	BlockNode Resolved = getPackagedNode(*this, Succ);
				677	assert(Resolved != LoopHead);
				678
				679	if (Working[Resolved.Index].ContainingLoop != LoopHead) {
				680	DEBUG(debugSuccessor(" exit ", Resolved));
				681	Dist.addExit(Resolved, Weight);
				682	return;
				683	}
				684
				685	if (!LoopHead.isValid() && Resolved < Pred) {
				686	// Irreducible backedge. Skip this edge in the distribution.
				687	DEBUG(debugSuccessor("skipped ", Resolved));
				688	return;
				689	}
				690
				691	DEBUG(debugSuccessor(" local ", Resolved));
				692	Dist.addLocal(Resolved, Weight);
				693	}
				694
				695	void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
				696	const BlockNode &LoopHead, const BlockNode &LocalLoopHead,
				697	Distribution &Dist) {
				698	PackagedLoopData &LoopPackage = getLoopPackage(LocalLoopHead);
				699	const PackagedLoopData::ExitMap &Exits = LoopPackage.Exits;
				700
				701	// Copy the exit map into Dist.
				702	for (const auto &I : Exits)
				703	addToDist(Dist, LoopHead, LocalLoopHead, I.first, I.second.getMass());
				704
				705	// We don't need this map any more. Clear it to prevent quadratic memory
				706	// usage in deeply nested loops with irreducible control flow.
				707	LoopPackage.Exits.clear();
				708	}
				709
				710	/// \brief Get the maximum allowed loop scale.
				711	///
				712	/// Gives the maximum number of estimated iterations allowed for a loop.
				713	/// Downstream users have trouble with very large numbers (even within
				714	/// 64-bits). Perhaps they can be changed to use PositiveFloat.
				715	///
				716	/// TODO: change downstream users so that this can be increased or removed.
				717	static Float getMaxLoopScale() { return Float(1, 12); }
				718
				719	/// \brief Compute the loop scale for a loop.
				720	void BlockFrequencyInfoImplBase::computeLoopScale(const BlockNode &LoopHead) {
				721	// Compute loop scale.
				722	DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(LoopHead) << "\n");
				723
				724	// LoopScale == 1 / ExitMass
				725	// ExitMass == HeadMass - BackedgeMass
				726	PackagedLoopData &LoopPackage = getLoopPackage(LoopHead);
				727	BlockMass ExitMass = BlockMass::getFull() - LoopPackage.BackedgeMass;
				728
				729	// Block scale stores the inverse of the scale.
				730	LoopPackage.Scale = ExitMass.toFloat().inverse();
				731
				732	DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
				733	<< " - " << LoopPackage.BackedgeMass << ")\n"
				734	<< " - scale = " << LoopPackage.Scale << "\n");
				735
				736	if (LoopPackage.Scale > getMaxLoopScale()) {
				737	LoopPackage.Scale = getMaxLoopScale();
				738	DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
				739	}
				740	}
				741
				742	/// \brief Package up a loop.
				743	void BlockFrequencyInfoImplBase::packageLoop(const BlockNode &LoopHead) {
				744	DEBUG(dbgs() << "packaging-loop: " << getBlockName(LoopHead) << "\n");
				745	Working[LoopHead.Index].IsAPackage = true;
				746	for (const BlockNode &M : getLoopPackage(LoopHead).Members) {
				747	DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
				748	Working[M.Index].IsPackaged = true;
				749	}
				750	}
				751
				752	void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
				753	const BlockNode &LoopHead,
				754	Distribution &Dist) {
				755	BlockMass Mass = getPackageMass(*this, Source);
				756	DEBUG(dbgs() << " => mass: " << Mass
				757	<< " ( general \| forward )\n");
				758
				759	// Distribute mass to successors as laid out in Dist.
				760	DitheringDistributer D(Dist, Mass);
				761
				762	#ifndef NDEBUG
				763	auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
				764	const char *Desc) {
				765	dbgs() << " => assign " << M << " (" << D.RemMass << "\|"
				766	<< D.RemForwardMass << ")";
				767	if (Desc)
				768	dbgs() << " [" << Desc << "]";
				769	if (T.isValid())
				770	dbgs() << " to " << getBlockName(T);
				771	dbgs() << "\n";
				772	};
				773	(void)debugAssign;
				774	#endif
				775
				776	PackagedLoopData *LoopPackage = 0;
				777	if (LoopHead.isValid())
				778	LoopPackage = &getLoopPackage(LoopHead);
				779	for (const Weight &W : Dist.Weights) {
				780	// Check for a local edge (forward and non-exit).
				781	if (W.Type == Weight::Local) {
				782	BlockMass Local = D.takeLocalMass(W.Amount);
				783	getPackageMass(*this, W.TargetNode) += Local;
				784	DEBUG(debugAssign(W.TargetNode, Local, nullptr));
				785	continue;
				786	}
				787
				788	// Backedges and exits only make sense if we're processing a loop.
				789	assert(LoopPackage && "backedge or exit outside of loop");
				790
				791	// Check for a backedge.
				792	if (W.Type == Weight::Backedge) {
				793	BlockMass Back = D.takeBackedgeMass(W.Amount);
				794	LoopPackage->BackedgeMass += Back;
				795	DEBUG(debugAssign(BlockNode(), Back, "back"));
				796	continue;
				797	}
				798
				799	// This must be an exit.
				800	assert(W.Type == Weight::Exit);
				801	BlockMass Exit = D.takeExitMass(W.Amount);
				802	LoopPackage->Exits.push_back(std::make_pair(W.TargetNode, Exit));
				803	DEBUG(debugAssign(W.TargetNode, Exit, "exit"));
				804	}
				805	}
				806
				807	static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
				808	const Float &Min, const Float &Max) {
				809	// Scale the Factor to a size that creates integers. Ideally, integers would
				810	// be scaled so that Max == UINT64_MAX so that they can be best
				811	// differentiated. However, the register allocator currently deals poorly
				812	// with large numbers. Instead, push Min up a little from 1 to give some
				813	// room to differentiate small, unequal numbers.
				814	//
				815	// TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max.
				816	Float ScalingFactor = Min.inverse();
				817	if ((Max / Min).lg() < 60)
				818	ScalingFactor <<= 3;
				819
				820	// Translate the floats to integers.
				821	DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
				822	<< ", factor = " << ScalingFactor << "\n");
				823	for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
				824	Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor;
				825	BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt<uint64_t>());
				826	DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
				827	<< BFI.Freqs[Index].Floating << ", scaled = " << Scaled
				828	<< ", int = " << BFI.Freqs[Index].Integer << "\n");
				829	}
				830	}
				831
				832	static void scaleBlockData(BlockFrequencyInfoImplBase &BFI,
				833	const BlockNode &Node,
				834	const PackagedLoopData &Loop) {
				835	Float F = Loop.Mass.toFloat() * Loop.Scale;
				836
				837	Float &Current = BFI.Freqs[Node.Index].Floating;
				838	Float Updated = Current * F;
				839
				840	DEBUG(dbgs() << " - " << BFI.getBlockName(Node) << ": " << Current << " => "
				841	<< Updated << "\n");
				842
				843	Current = Updated;
				844	}
				845
				846	/// \brief Unwrap a loop package.
				847	///
				848	/// Visits all the members of a loop, adjusting their BlockData according to
				849	/// the loop's pseudo-node.
				850	static void unwrapLoopPackage(BlockFrequencyInfoImplBase &BFI,
				851	const BlockNode &Head) {
				852	assert(Head.isValid());
				853
				854	PackagedLoopData &LoopPackage = BFI.getLoopPackage(Head);
				855	DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Head)
				856	<< ": mass = " << LoopPackage.Mass
				857	<< ", scale = " << LoopPackage.Scale << "\n");
				858	scaleBlockData(BFI, Head, LoopPackage);
				859
				860	// Propagate the head scale through the loop. Since members are visited in
				861	// RPO, the head scale will be updated by the loop scale first, and then the
				862	// final head scale will be used for updated the rest of the members.
				863	for (const BlockNode &M : LoopPackage.Members) {
				864	const FrequencyData &HeadData = BFI.Freqs[Head.Index];
				865	FrequencyData &Freqs = BFI.Freqs[M.Index];
				866	Float NewFreq = Freqs.Floating * HeadData.Floating;
				867	DEBUG(dbgs() << " - " << BFI.getBlockName(M) << ": " << Freqs.Floating
				868	<< " => " << NewFreq << "\n");
				869	Freqs.Floating = NewFreq;
				870	}
				871	}
				872
				873	void BlockFrequencyInfoImplBase::finalizeMetrics() {
				874	// Set initial frequencies from loop-local masses.
				875	for (size_t Index = 0; Index < Working.size(); ++Index)
				876	Freqs[Index].Floating = Working[Index].Mass.toFloat();
				877
				878	// Unwrap loop packages in reverse post-order, tracking min and max
				879	// frequencies.
				880	auto Min = Float::getLargest();
				881	auto Max = Float::getZero();
				882	for (size_t Index = 0; Index < Working.size(); ++Index) {
				883	if (Working[Index].isLoopHeader())
				884	unwrapLoopPackage(*this, BlockNode(Index));
				885
				886	// Update max scale.
				887	Min = std::min(Min, Freqs[Index].Floating);
				888	Max = std::max(Max, Freqs[Index].Floating);
				889	}
				890
				891	// Convert to integers.
				892	convertFloatingToInteger(*this, Min, Max);
				893
				894	// Clean up data structures.
				895	cleanup(*this);
				896
				897	// Print out the final stats.
				898	DEBUG(dump());
				899	}
				900
				901	BlockFrequency
				902	BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
				903	if (!Node.isValid())
				904	return 0;
				905	return Freqs[Node.Index].Integer;
				906	}
				907	Float
				908	BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
				909	if (!Node.isValid())
				910	return Float::getZero();
				911	return Freqs[Node.Index].Floating;
				912	}
				913
				914	std::string
				915	BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
				916	return std::string();
				917	}
				918
				919	raw_ostream &
				920	BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
				921	const BlockNode &Node) const {
				922	return OS << getFloatingBlockFreq(Node);
				923	}
				924
				925	raw_ostream &
				926	BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
				927	const BlockFrequency &Freq) const {
				928	Float Block(Freq.getFrequency(), 0);
				929	Float Entry(getEntryFreq(), 0);
				930
				931	return OS << Block / Entry;
				932	}