//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// \file
//==-----------------------------------------------------------------------===//

#define DEBUG_TYPE "PeepholeOpt"
#ifdef DEBUG
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#else
#define DEBUGME 0
#endif

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
    "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The peephole optimization pass is used to do simple last-minute
// optimizations that are required for correct code or that remove
// redundant functions.
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, we mark all pointers as living in the arena. This is
  // a workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because we don't want to invalidate any iterators while inside
  // safeNestedForEach, atomic conversions are pushed to a vector and
  // handled later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware we do not have 24-bit signed operations, so in
  // this case we need to expand them. These functions check for 24-bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards the division is slightly less accurate than on
  // previous generations, so we need to use a more accurate division there.
  // On all other cards the accurate divide can be translated to a normal
  // divide.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel-defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
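// If the callback F returns true, it is assumed to have advanced the inner
// iterator itself (for example because it erased the current instruction);
// otherwise the loop advances the iterator.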
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F) {
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; ) {
      if (!F(&sf)) {
        ++sf;
      }
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
FunctionPass *
createAMDGPUPeepholeOpt(TargetMachine &tm) {
  return new AMDGPUPeepholeOpt(tm);
}
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm) {
  mDebug = DEBUGME;
  optLevel = TM.getOptLevel();
}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
}

const char *
AMDGPUPeepholeOpt::getPassName() const {
  return "AMDGPU PeepHole Optimization Pass";
}

bool
containsPointerType(Type *Ty) {
  if (!Ty) {
    return false;
  }
  switch(Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
      atomicFuncs[x].first->getNumOperands()-1,
      atomicFuncs[x].second);
  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool
AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If optimizations are disabled, this cannot be properly evaluated yet,
    // so we add the call instruction to a vector and process the whole
    // vector at the end, after the samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                                : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
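  // Atomic builtins whose result is never used can be redirected to their
  // "_noret" variants (exchange is excluded because its result is the whole
  // point); they are recorded here and rewritten later in
  // doAtomicConversionIfNeeded().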
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair(CI, F));
  }

  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift) {
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the
    // patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
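  // For example (roughly), (A & 0xFF00) | (D & 0x00FF) writes an 8-bit field
  // taken from bits 8-15 of A over the low bits of D, so it can be collapsed
  // into a single __amdil_ubit_insert call with width = 8 and offset = 8.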
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
    dbgs() << "Op: "; inst->dump();
    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
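  // For example, a mask of 0x00FF0000 yields a width of 8 (its population
  // count) and an offset of 16 (its number of trailing zeros); with no mask,
  // the shift amount alone determines the offset.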
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    return false;
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
                                      "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on shift-right/and patterns. The
  // basic optimization is to turn (A >> B) & C, where A is a 32bit type, B is
  // a value smaller than 32 and C is a mask. If C is a constant value, then
  // the following transformation can occur. For signed integers, it turns
  // into the function call dst = __amdil_ibit_extract(log2(C), B, A). For
  // unsigned integers, it turns into the function call
  // dst = __amdil_ubit_extract(log2(C), B, A). The function
  // __amdil_[u|i]bit_extract can be found in Section 7.9 of the ATI IL spec
  // of the stream SDK for Evergreen hardware.
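  // For example, (A >> 8) & 0xFF extracts an 8-bit field starting at bit 8 of
  // A, so (assuming the constants pass the checks below) it can be replaced
  // by a single bit-extract call with an offset of 8 and a width of 8.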
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32bit integers
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If we are a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant
  // integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shiftval is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left,
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
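    // For example, 0x000000FF and 0x00000001 are accepted here, while a
    // shifted mask such as 0x0000FF00 is rejected.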
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left, then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDGPU.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
                                   APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
                                   APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
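  // For example, __amdil_bfi(0x0000FF00, x, y) keeps bits 8-15 from x and
  // takes every other bit from y.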
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
                           "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
                               "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
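  // For example, with src0 = 4 and src1 = 8 this evaluates to
  // ((1 << 4) - 1) << 8 = 0x00000F00, i.e. a 4-bit mask starting at bit 8.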
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
                               lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
                               newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size <= alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
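  // Setting the alignment to 0 below lets the load/store fall back to the
  // natural (ABI) alignment for the type instead of the undersized value.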
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  // Note that "__amdil_imul24_high" also matches the "__amdil_imul24" prefix.
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}

void
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24-bit operations, so we need to
  // expand them to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
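  // For example, a call to __amdil_imad24(a, b, c) is rewritten below as a
  // call to __amdil_imad_i32(a, b, c), and __amdil_imul24(a, b) simply
  // becomes a 32-bit mul instruction. The more specific __amdil_imul24_high
  // prefix has to be checked before the shorter __amdil_imul24 prefix.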
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  }
}

bool
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}

bool
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
}

void
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

  unsigned samplerIdx = 1;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If the global has no initializer, or its initializer is not a 32-bit
  // integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
    return false;
  }

  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}

bool
AMDGPUPeepholeOpt::doInitialization(Module &M) {
  return false;
}

bool
AMDGPUPeepholeOpt::doFinalization(Module &M) {
  return false;
}

void
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}

size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  };
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");
  return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}