| /**************************************************************************** |
| * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * @file fetch_jit.cpp |
| * |
| * @brief Implementation of the fetch jitter |
| * |
| * Notes: |
| * |
| ******************************************************************************/ |
| #include "builder.h" |
| #include "jit_api.h" |
| #include "fetch_jit.h" |
| #include "gen_state_llvm.h" |
| #include <sstream> |
| #include <tuple> |
| |
| //#define FETCH_DUMP_VERTEX 1 |
| using namespace llvm; |
| using namespace SwrJit; |
| |
| bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); |
| |
| // Numeric conversion applied to gathered vertex data before it is written to the |
| // simdvertex output; consumed by the Shuffle*bpcGather* helpers below. |
| enum ConversionType |
| { |
| CONVERT_NONE, |
| CONVERT_NORMALIZED, |
| CONVERT_USCALED, |
| CONVERT_SSCALED, |
| CONVERT_SFIXED, |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// Interface to Jitting a fetch shader |
| ////////////////////////////////////////////////////////////////////////// |
| struct FetchJit : public Builder |
| { |
| FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; |
| |
| Function* Create(const FETCH_COMPILE_STATE& fetchState); |
| Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); |
| Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); |
| Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); |
| |
| // package up Shuffle*bpcGatherd args into a tuple for convenience |
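| // fields, in order: gathered dword data (in), output pointer, extension cast op, |
| // conversion type, current vertex element (in/out), output element (in/out), |
| // component enable mask, per-component control, output SIMD elements, component swizzle |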
| typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType, |
| uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4], |
| const uint32_t(&)[4]> Shuffle8bpcArgs; |
| #if USE_SIMD16_SHADERS |
| void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2); |
| #else |
| void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); |
| #endif |
| |
| typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType, |
| uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs; |
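| // same layout as Shuffle8bpcArgs, except the gathered data arrives as two SIMDs |
| // (xy and zw pairs) and no swizzle table is needed |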
| #if USE_SIMD16_SHADERS |
| void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2); |
| #else |
| void Shuffle16bpcGather(Shuffle16bpcArgs &args); |
| #endif |
| |
| void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); |
| #if USE_SIMD16_BUILDER |
| void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); |
| #endif |
| |
| #if USE_SIMD16_SHADERS |
| Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); |
| #else |
| Value* GenerateCompCtrlVector(const ComponentControl ctrl); |
| #endif |
| |
| void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); |
| #if USE_SIMD16_SHADERS |
| #define USE_SIMD16_GATHERS 0 |
| |
| #if USE_SIMD16_GATHERS |
| void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); |
| #else |
| void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2); |
| #endif |
| #else |
| void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); |
| #endif |
| |
| bool IsOddFormat(SWR_FORMAT format); |
| bool IsUniformFormat(SWR_FORMAT format); |
| void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]); |
| void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); |
| void ConvertFormat(SWR_FORMAT format, Value *texels[4]); |
| |
| Value* mpFetchInfo; |
| }; |
| |
| Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) |
| { |
| std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); |
| fnName << ComputeCRC(0, &fetchState, sizeof(fetchState)); |
| |
| Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); |
| BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); |
| |
| fetch->getParent()->setModuleIdentifier(fetch->getName()); |
| |
| IRB()->SetInsertPoint(entry); |
| |
| auto argitr = fetch->arg_begin(); |
| |
| // Fetch shader arguments |
| mpFetchInfo = &*argitr; ++argitr; |
| mpFetchInfo->setName("fetchInfo"); |
| Value* pVtxOut = &*argitr; |
| pVtxOut->setName("vtxOutput"); |
| // This GEP is just shorthand to tell LLVM to get a pointer to the base address of the simdvertex output: |
| // index 0 selects the simdvertex structure itself, |
| // index 1 selects which element of the simdvertex structure to offset to (0 in this case), |
| // so the indices being i32's doesn't matter. |
| // TODO: generate this GEP with a VECTOR structure type so this makes sense. |
| std::vector<Value*> vtxInputIndices(2, C(0)); |
| // GEP |
| pVtxOut = GEP(pVtxOut, C(0)); |
| #if USE_SIMD16_SHADERS |
| #if 0// USE_SIMD16_BUILDER |
| pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); |
| #else |
| pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); |
| #endif |
| #else |
| pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); |
| #endif |
| |
| // SWR_FETCH_CONTEXT::pStreams |
| Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); |
| streams->setName("pStreams"); |
| |
| // SWR_FETCH_CONTEXT::pIndices |
| Value* indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pIndices}); |
| indices->setName("pIndices"); |
| |
| // SWR_FETCH_CONTEXT::pLastIndex |
| Value* pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex}); |
| pLastIndex->setName("pLastIndex"); |
| |
| |
| Value* vIndices; |
| #if USE_SIMD16_SHADERS |
| Value* indices2; |
| Value* vIndices2; |
| #endif |
| switch(fetchState.indexType) |
| { |
| case R8_UINT: |
| indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); |
| #if USE_SIMD16_SHADERS |
| indices2 = GEP(indices, C(8)); |
| #endif |
| if(fetchState.bDisableIndexOOBCheck) |
| { |
| vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); |
| vIndices = Z_EXT(vIndices, mSimdInt32Ty); |
| #if USE_SIMD16_SHADERS |
| vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 }); |
| vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty); |
| #endif |
| } |
| else |
| { |
| pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); |
| vIndices = GetSimdValid8bitIndices(indices, pLastIndex); |
| #if USE_SIMD16_SHADERS |
| pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0)); |
| vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex); |
| #endif |
| } |
| break; |
| case R16_UINT: |
| indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0)); |
| #if USE_SIMD16_SHADERS |
| indices2 = GEP(indices, C(8)); |
| #endif |
| if(fetchState.bDisableIndexOOBCheck) |
| { |
| vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0}); |
| vIndices = Z_EXT(vIndices, mSimdInt32Ty); |
| #if USE_SIMD16_SHADERS |
| vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 }); |
| vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty); |
| #endif |
| } |
| else |
| { |
| pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); |
| vIndices = GetSimdValid16bitIndices(indices, pLastIndex); |
| #if USE_SIMD16_SHADERS |
| pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0)); |
| vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex); |
| #endif |
| } |
| break; |
| case R32_UINT: |
| #if USE_SIMD16_SHADERS |
| indices2 = GEP(indices, C(8)); |
| #endif |
| vIndices = fetchState.bDisableIndexOOBCheck ? LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty, 0)), {(uint32_t)0}) |
| : GetSimdValid32bitIndices(indices, pLastIndex); |
| #if USE_SIMD16_SHADERS |
| vIndices2 = fetchState.bDisableIndexOOBCheck ? LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), {(uint32_t)0}) |
| : GetSimdValid32bitIndices(indices2, pLastIndex); |
| #endif |
| break; // incoming type is already 32bit int |
| default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break; |
| } |
| |
| if(fetchState.bForceSequentialAccessEnable) |
| { |
| Value* pOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 }); |
| |
| // VertexData buffers are accessed sequentially, the index is equal to the vertex number |
| vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex })); |
| vIndices = ADD(vIndices, pOffsets); |
| #if USE_SIMD16_SHADERS |
| vIndices2 = ADD(vIndices, VIMMED1(8)); |
| #endif |
| } |
| |
| Value* vVertexId = vIndices; |
| #if USE_SIMD16_SHADERS |
| Value* vVertexId2 = vIndices2; |
| #endif |
| if (fetchState.bVertexIDOffsetEnable) |
| { |
| // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct |
| Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); |
| Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex })); |
| vVertexId = ADD(vIndices, vBaseVertex); |
| vVertexId = ADD(vVertexId, vStartVertex); |
| #if USE_SIMD16_SHADERS |
| vVertexId2 = ADD(vIndices2, vBaseVertex); |
| vVertexId2 = ADD(vVertexId2, vStartVertex); |
| #endif |
| } |
| |
| // store out vertex IDs |
| STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })); |
| #if USE_SIMD16_SHADERS |
| STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })); |
| #endif |
| |
| // store out cut mask if enabled |
| if (fetchState.bEnableCutIndex) |
| { |
| Value* vCutIndex = VIMMED1(fetchState.cutIndex); |
| Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); |
| STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask })); |
| #if USE_SIMD16_SHADERS |
| Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex)); |
| STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 })); |
| #endif |
| } |
| |
| // Fetch attributes from memory and output to a simdvertex struct |
| // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use |
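| // bDisableVGATHER selects the scalar load path (JitLoadVertices) over the gather |
| // path (JitGatherVertices) |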
| #if USE_SIMD16_SHADERS |
| if (fetchState.bDisableVGATHER) |
| { |
| JitLoadVertices(fetchState, streams, vIndices, pVtxOut); |
| JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1))); |
| } |
| else |
| { |
| #if USE_SIMD16_GATHERS |
| JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false); |
| #else |
| JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false); |
| JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true); |
| #endif |
| } |
| #else |
| if (fetchState.bDisableVGATHER) |
| { |
| JitLoadVertices(fetchState, streams, vIndices, pVtxOut); |
| } |
| else |
| { |
| JitGatherVertices(fetchState, streams, vIndices, pVtxOut); |
| } |
| #endif |
| |
| RET_VOID(); |
| |
| JitManager::DumpToFile(fetch, "src"); |
| |
| #if defined(_DEBUG) |
| verifyFunction(*fetch); |
| #endif |
| |
| ::FunctionPassManager setupPasses(JM()->mpCurrentModule); |
| |
| ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) |
| setupPasses.add(createBreakCriticalEdgesPass()); |
| setupPasses.add(createCFGSimplificationPass()); |
| setupPasses.add(createEarlyCSEPass()); |
| setupPasses.add(createPromoteMemoryToRegisterPass()); |
| |
| setupPasses.run(*fetch); |
| |
| JitManager::DumpToFile(fetch, "se"); |
| |
| ::FunctionPassManager optPasses(JM()->mpCurrentModule); |
| |
| ///@todo Haven't touched these either. Need to remove some of these and add others. |
| optPasses.add(createCFGSimplificationPass()); |
| optPasses.add(createEarlyCSEPass()); |
| optPasses.add(createInstructionCombiningPass()); |
| optPasses.add(createInstructionSimplifierPass()); |
| optPasses.add(createConstantPropagationPass()); |
| optPasses.add(createSCCPPass()); |
| optPasses.add(createAggressiveDCEPass()); |
| |
| optPasses.run(*fetch); |
| optPasses.run(*fetch); |
| |
| JitManager::DumpToFile(fetch, "opt"); |
| |
| return fetch; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Loads attributes from memory using LOADs, shuffling the |
| /// components into SOA form. |
| /// *Note* currently does not support component control, |
| /// component packing, instancing |
| /// @param fetchState - info about attributes to be fetched from memory |
| /// @param streams - value pointer to the current vertex stream |
| /// @param vIndices - vector value of indices to load |
| /// @param pVtxOut - value pointer to output simdvertex struct |
| void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut) |
| { |
| // Zack shuffles; a variant of the Charleston. |
| |
| std::vector<Value*> vectors(16); |
| std::vector<Constant*> pMask(mVWidth); |
| for(uint32_t i = 0; i < mVWidth; ++i) |
| { |
| pMask[i] = (C(i < 4 ? i : 4)); |
| } |
| Constant* promoteMask = ConstantVector::get(pMask); |
| Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4)); |
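| // promoteMask widens a 4-wide attribute to the full SIMD width: lanes 0..3 select the |
| // loaded components, the remaining lanes select index 4, i.e. the undef helper vector |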
| |
| Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); |
| Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); |
| Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); |
| Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); |
| curInstance->setName("curInstance"); |
| |
| for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt) |
| { |
| Value* elements[4] = {0}; |
| const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt]; |
| const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); |
| SWR_ASSERT((info.bpp != 0), "Unsupported format in JitLoadVertices."); |
| uint32_t numComponents = info.numComps; |
| uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. |
| |
| // load path doesn't support component packing |
| SWR_ASSERT(ied.ComponentPacking == ComponentEnable::XYZW, "Fetch load path doesn't support component packing."); |
| |
| vectors.clear(); |
| |
| if (fetchState.bInstanceIDOffsetEnable) |
| { |
| SWR_ASSERT((0), "TODO: Fill out more once driver sends this down"); |
| } |
| |
| Value *vCurIndices; |
| Value *startOffset; |
| if(ied.InstanceEnable) |
| { |
| Value* stepRate = C(ied.InstanceAdvancementState); |
| |
| // prevent a div by 0 for 0 step rate |
| Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); |
| stepRate = SELECT(isNonZeroStep, stepRate, C(1)); |
| |
| // calc the current offset into instanced data buffer |
| Value* calcInstance = UDIV(curInstance, stepRate); |
| |
| // if step rate is 0, every instance gets instance 0 |
| calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); |
| |
| vCurIndices = VBROADCAST(calcInstance); |
| |
| startOffset = startInstance; |
| } |
| else if (ied.InstanceStrideEnable) |
| { |
| SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); |
| } |
| else |
| { |
| // offset indices by baseVertex |
| vCurIndices = ADD(vIndices, vBaseVertex); |
| |
| startOffset = startVertex; |
| } |
| |
| // load SWR_VERTEX_BUFFER_STATE::pData |
| Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); |
| |
| // load SWR_VERTEX_BUFFER_STATE::pitch |
| Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); |
| stride = Z_EXT(stride, mInt64Ty); |
| |
| // load SWR_VERTEX_BUFFER_STATE::size |
| Value *size = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_size}); |
| size = Z_EXT(size, mInt64Ty); |
| |
| Value* startVertexOffset = MUL(Z_EXT(startOffset, mInt64Ty), stride); |
| |
| Value *minVertex = NULL; |
| Value *minVertexOffset = NULL; |
| if (fetchState.bPartialVertexBuffer) { |
| // fetch min index for low bounds checking |
| minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)}); |
| minVertex = LOAD(minVertex); |
| if (!fetchState.bDisableIndexOOBCheck) { |
| minVertexOffset = MUL(Z_EXT(minVertex, mInt64Ty), stride); |
| } |
| } |
| |
| // Load from the stream. |
| for(uint32_t lane = 0; lane < mVWidth; ++lane) |
| { |
| // Get index |
| Value* index = VEXTRACT(vCurIndices, C(lane)); |
| |
| if (fetchState.bPartialVertexBuffer) { |
| // clamp below minvertex |
| Value *isBelowMin = ICMP_SLT(index, minVertex); |
| index = SELECT(isBelowMin, minVertex, index); |
| } |
| |
| index = Z_EXT(index, mInt64Ty); |
| |
| Value* offset = MUL(index, stride); |
| offset = ADD(offset, C((int64_t)ied.AlignedByteOffset)); |
| offset = ADD(offset, startVertexOffset); |
| |
| if (!fetchState.bDisableIndexOOBCheck) { |
| // check for out of bound access, including partial OOB, and replace them with minVertex |
| Value *endOffset = ADD(offset, C((int64_t)info.Bpp)); |
| Value *oob = ICMP_ULE(endOffset, size); |
| if (fetchState.bPartialVertexBuffer) { |
| offset = SELECT(oob, offset, minVertexOffset); |
| } else { |
| offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0)); |
| } |
| } |
| |
| Value* pointer = GEP(stream, offset); |
| // Always load a full 4-component lane, regardless of how many components are needed. |
| Value* vptr = 0; |
| |
| // get a pointer to a 4 component attrib in default address space |
| switch(bpc) |
| { |
| case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break; |
| case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break; |
| case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break; |
| default: SWR_INVALID("Unsupported underlying bpp!"); |
| } |
| |
| // load 4 components of attribute |
| Value* vec = ALIGNED_LOAD(vptr, 1, false); |
| |
| // Convert To FP32 internally |
| switch(info.type[0]) |
| { |
| case SWR_TYPE_UNORM: |
| switch(bpc) |
| { |
| case 8: |
| vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); |
| vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0)))); |
| break; |
| case 16: |
| vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); |
| vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0)))); |
| break; |
| default: |
| SWR_INVALID("Unsupported underlying type!"); |
| break; |
| } |
| break; |
| case SWR_TYPE_SNORM: |
| switch(bpc) |
| { |
| case 8: |
| vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); |
| vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0)))); |
| break; |
| case 16: |
| vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); |
| vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0)))); |
| break; |
| default: |
| SWR_INVALID("Unsupported underlying type!"); |
| break; |
| } |
| break; |
| case SWR_TYPE_UINT: |
| // Zero extend uint32_t types. |
| switch(bpc) |
| { |
| case 8: |
| case 16: |
| vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4)); |
| vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); |
| break; |
| case 32: |
| break; // Pass through unchanged. |
| default: |
| SWR_INVALID("Unsupported underlying type!"); |
| break; |
| } |
| break; |
| case SWR_TYPE_SINT: |
| // Sign extend SINT types. |
| switch(bpc) |
| { |
| case 8: |
| case 16: |
| vec = S_EXT(vec, VectorType::get(mInt32Ty, 4)); |
| vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); |
| break; |
| case 32: |
| break; // Pass through unchanged. |
| default: |
| SWR_INVALID("Unsupported underlying type!"); |
| break; |
| } |
| break; |
| case SWR_TYPE_FLOAT: |
| switch(bpc) |
| { |
| case 32: |
| break; // Pass through unchanged. |
| default: |
| SWR_INVALID("Unsupported underlying type!"); |
| } |
| break; |
| case SWR_TYPE_USCALED: |
| vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); |
| break; |
| case SWR_TYPE_SSCALED: |
| vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); |
| break; |
| case SWR_TYPE_SFIXED: |
| vec = FMUL(SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)), VBROADCAST(C(1/65536.0f))); |
| break; |
| case SWR_TYPE_UNKNOWN: |
| case SWR_TYPE_UNUSED: |
| SWR_INVALID("Unsupported type %d!", info.type[0]); |
| } |
| |
| // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4) |
| // uwvec: 4 x F32, undef value |
| Value* wvec = VSHUFFLE(vec, uwvec, promoteMask); |
| vectors.push_back(wvec); |
| } |
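| // vectors[] now holds one xyzw attribute per lane; the shuffle network below |
| // transposes each 4-wide block from AOS to SOA so that elements[c] ends up |
| // holding component c for every lane |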
| |
| std::vector<Constant*> v01Mask(mVWidth); |
| std::vector<Constant*> v23Mask(mVWidth); |
| std::vector<Constant*> v02Mask(mVWidth); |
| std::vector<Constant*> v13Mask(mVWidth); |
| |
| // Concatenate the vectors together. |
| elements[0] = VUNDEF_F(); |
| elements[1] = VUNDEF_F(); |
| elements[2] = VUNDEF_F(); |
| elements[3] = VUNDEF_F(); |
| for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b) |
| { |
| v01Mask[4 * b + 0] = C(0 + 4 * b); |
| v01Mask[4 * b + 1] = C(1 + 4 * b); |
| v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); |
| v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth); |
| |
| v23Mask[4 * b + 0] = C(2 + 4 * b); |
| v23Mask[4 * b + 1] = C(3 + 4 * b); |
| v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth); |
| v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); |
| |
| v02Mask[4 * b + 0] = C(0 + 4 * b); |
| v02Mask[4 * b + 1] = C(2 + 4 * b); |
| v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); |
| v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth); |
| |
| v13Mask[4 * b + 0] = C(1 + 4 * b); |
| v13Mask[4 * b + 1] = C(3 + 4 * b); |
| v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth); |
| v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); |
| |
| std::vector<Constant*> iMask(mVWidth); |
| for(uint32_t i = 0; i < mVWidth; ++i) |
| { |
| if(((4 * b) <= i) && (i < (4 * (b + 1)))) |
| { |
| iMask[i] = C(i % 4 + mVWidth); |
| } |
| else |
| { |
| iMask[i] = C(i); |
| } |
| } |
| Constant* insertMask = ConstantVector::get(iMask); |
| elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask); |
| elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask); |
| elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask); |
| elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask); |
| } |
| |
| Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask)); |
| Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask)); |
| Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask)); |
| Value* z2w2z3w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask)); |
| elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask)); |
| elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask)); |
| elements[2] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v02Mask)); |
| elements[3] = VSHUFFLE(z0w0z1w1, z2w2z3w3, ConstantVector::get(v13Mask)); |
| |
| // fill unfetched components with the defaults (0, 0, 0, 1); fall-through is intentional |
| switch(numComponents + 1) |
| { |
| case 1: elements[0] = VIMMED1(0.0f); |
| case 2: elements[1] = VIMMED1(0.0f); |
| case 3: elements[2] = VIMMED1(0.0f); |
| case 4: elements[3] = VIMMED1(1.0f); |
| } |
| |
| for(uint32_t c = 0; c < 4; ++c) |
| { |
| #if USE_SIMD16_SHADERS |
| Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP"); |
| #else |
| Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); |
| #endif |
| STORE(elements[c], dest); |
| } |
| } |
| } |
| |
| // returns true for odd formats that require special gather handling |
| bool FetchJit::IsOddFormat(SWR_FORMAT format) |
| { |
| const SWR_FORMAT_INFO& info = GetFormatInfo(format); |
| if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64) |
| { |
| return true; |
| } |
| return false; |
| } |
| |
| // format is uniform if all components are the same size and type |
| bool FetchJit::IsUniformFormat(SWR_FORMAT format) |
| { |
| const SWR_FORMAT_INFO& info = GetFormatInfo(format); |
| uint32_t bpc0 = info.bpc[0]; |
| uint32_t type0 = info.type[0]; |
| |
| for (uint32_t c = 1; c < info.numComps; ++c) |
| { |
| if (bpc0 != info.bpc[c] || type0 != info.type[c]) |
| { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // unpacks components based on format |
| // foreach component in the pixel |
| // mask off everything but this component |
| // shift component to LSB |
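| // e.g. for a packed 10/10/10/2 layout the masks walk up the dword: |
| // comp 0: 0x3FF << 0, comp 1: 0x3FF << 10, comp 2: 0x3FF << 20, comp 3: 0x3 << 30 |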
| void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]) |
| { |
| const SWR_FORMAT_INFO& info = GetFormatInfo(format); |
| |
| uint32_t bitOffset = 0; |
| for (uint32_t c = 0; c < info.numComps; ++c) |
| { |
| uint32_t swizzledIndex = info.swizzle[c]; |
| uint32_t compBits = info.bpc[c]; |
| uint32_t bitmask = ((1 << compBits) - 1) << bitOffset; |
| Value* comp = AND(vInput, bitmask); |
| comp = LSHR(comp, bitOffset); |
| |
| result[swizzledIndex] = comp; |
| bitOffset += compBits; |
| } |
| } |
| |
| // gather for odd component size formats |
| // gather SIMD full pixels per lane then shift/mask to move each component to their |
| // own vector |
| void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4]) |
| { |
| const SWR_FORMAT_INFO &info = GetFormatInfo(format); |
| |
| // only works if pixel size is <= 32bits |
| SWR_ASSERT(info.bpp <= 32); |
| |
| Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask, C((char)1)); |
| |
| for (uint32_t comp = 0; comp < 4; ++comp) |
| { |
| pResult[comp] = VIMMED1((int)info.defaults[comp]); |
| } |
| |
| UnpackComponents(format, pGather, pResult); |
| |
| // reinterpret raw component bits as fp32; any numeric conversion happens later in ConvertFormat |
| pResult[0] = BITCAST(pResult[0], mSimdFP32Ty); |
| pResult[1] = BITCAST(pResult[1], mSimdFP32Ty); |
| pResult[2] = BITCAST(pResult[2], mSimdFP32Ty); |
| pResult[3] = BITCAST(pResult[3], mSimdFP32Ty); |
| } |
| |
| void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) |
| { |
| const SWR_FORMAT_INFO &info = GetFormatInfo(format); |
| |
| for (uint32_t c = 0; c < info.numComps; ++c) |
| { |
| uint32_t compIndex = info.swizzle[c]; |
| |
| // skip any conversion on UNUSED components |
| if (info.type[c] == SWR_TYPE_UNUSED) |
| { |
| continue; |
| } |
| |
| if (info.isNormalized[c]) |
| { |
| if (info.type[c] == SWR_TYPE_SNORM) |
| { |
| /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f. |
| |
| /// result = c * (1.0f / (2^(n-1) - 1)) |
| uint32_t n = info.bpc[c]; |
| uint32_t pow2 = 1 << (n - 1); |
| float scale = 1.0f / (float)(pow2 - 1); |
| Value *vScale = VIMMED1(scale); |
| texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); |
| texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); |
| texels[compIndex] = FMUL(texels[compIndex], vScale); |
| } |
| else |
| { |
| SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM); |
| |
| /// result = c * (1.0f / (2^n - 1)) |
| uint32_t n = info.bpc[c]; |
| uint32_t pow2 = 1 << n; |
| // special case 24bit unorm format, which requires a full divide to meet ULP requirement |
| if (n == 24) |
| { |
| float scale = (float)(pow2 - 1); |
| Value* vScale = VIMMED1(scale); |
| texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); |
| texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); |
| texels[compIndex] = FDIV(texels[compIndex], vScale); |
| } |
| else |
| { |
| float scale = 1.0f / (float)(pow2 - 1); |
| Value *vScale = VIMMED1(scale); |
| texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); |
| texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty); |
| texels[compIndex] = FMUL(texels[compIndex], vScale); |
| } |
| } |
| continue; |
| } |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Loads attributes from memory using AVX2 GATHER(s) |
| /// @param fetchState - info about attributes to be fetched from memory |
| /// @param streams - value pointer to the current vertex stream |
| /// @param vIndices - vector value of indices to gather |
| /// @param pVtxOut - value pointer to output simdvertex struct |
| #if USE_SIMD16_SHADERS |
| #if USE_SIMD16_GATHERS |
| void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, |
| Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2) |
| #else |
| void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, |
| Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2) |
| #endif |
| #else |
| void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, |
| Value* streams, Value* vIndices, Value* pVtxOut) |
| #endif |
| { |
| uint32_t currentVertexElement = 0; |
| uint32_t outputElt = 0; |
| Value* vVertexElements[4]; |
| #if USE_SIMD16_GATHERS |
| Value* vVertexElements2[4]; |
| #endif |
| |
| Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); |
| Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); |
| Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); |
| Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); |
| curInstance->setName("curInstance"); |
| |
| for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) |
| { |
| const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; |
| |
| // skip element if all components are disabled |
| if (ied.ComponentPacking == ComponentEnable::NONE) |
| { |
| continue; |
| } |
| |
| const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); |
| SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices."); |
| uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. |
| |
| Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData}); |
| |
| // VGATHER* takes an *i8 src pointer |
| Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0)); |
| |
| Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); |
| Value *vStride = VBROADCAST(stride); |
| |
| // max vertex index that is fully in bounds |
| Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); |
| maxVertex = LOAD(maxVertex); |
| |
| Value *minVertex = NULL; |
| if (fetchState.bPartialVertexBuffer) |
| { |
| // min vertex index for low bounds OOB checking |
| minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)}); |
| minVertex = LOAD(minVertex); |
| } |
| |
| if (fetchState.bInstanceIDOffsetEnable) |
| { |
| // the InstanceID (curInstance) value is offset by StartInstanceLocation |
| curInstance = ADD(curInstance, startInstance); |
| } |
| |
| Value *vCurIndices; |
| #if USE_SIMD16_GATHERS |
| Value *vCurIndices2; |
| #endif |
| Value *startOffset; |
| Value *vInstanceStride = VIMMED1(0); |
| |
| if (ied.InstanceEnable) |
| { |
| Value* stepRate = C(ied.InstanceAdvancementState); |
| |
| // prevent a div by 0 for 0 step rate |
| Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); |
| stepRate = SELECT(isNonZeroStep, stepRate, C(1)); |
| |
| // calc the current offset into instanced data buffer |
| Value* calcInstance = UDIV(curInstance, stepRate); |
| |
| // if step rate is 0, every instance gets instance 0 |
| calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); |
| |
| vCurIndices = VBROADCAST(calcInstance); |
| #if USE_SIMD16_GATHERS |
| vCurIndices2 = VBROADCAST(calcInstance); |
| #endif |
| |
| startOffset = startInstance; |
| } |
| else if (ied.InstanceStrideEnable) |
| { |
| // grab the instance advancement state, determines stride in bytes from one instance to the next |
| Value* stepRate = C(ied.InstanceAdvancementState); |
| vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); |
| |
| // offset indices by baseVertex |
| vCurIndices = ADD(vIndices, vBaseVertex); |
| #if USE_SIMD16_GATHERS |
| vCurIndices2 = ADD(vIndices2, vBaseVertex); |
| #endif |
| |
| startOffset = startVertex; |
| SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); |
| } |
| else |
| { |
| // offset indices by baseVertex |
| vCurIndices = ADD(vIndices, vBaseVertex); |
| #if USE_SIMD16_GATHERS |
| vCurIndices2 = ADD(vIndices2, vBaseVertex); |
| #endif |
| |
| startOffset = startVertex; |
| } |
| |
| // All of the OOB calculations are in vertices, not VB offsets, to prevent having to |
| // do 64bit address offset calculations. |
| |
| // calculate byte offset to the start of the VB |
| Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); |
| pStreamBase = GEP(pStreamBase, baseOffset); |
| |
| // if we have a start offset, subtract from max vertex. Used for OOB check |
| maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); |
| Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0)); |
| // if we have a negative value, we're already OOB. clamp at 0. |
| maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty)); |
| |
| if (fetchState.bPartialVertexBuffer) |
| { |
| // similarly for min vertex |
| minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); |
| Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0)); |
| minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty)); |
| } |
| |
| // Load the in bounds size of a partially valid vertex |
| Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); |
| partialInboundsSize = LOAD(partialInboundsSize); |
| Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); |
| Value* vBpp = VBROADCAST(C(info.Bpp)); |
| Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); |
| |
| // is the element <= the partially valid size |
| Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); |
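| // lanes sitting exactly at maxVertex are only partially in bounds; they are fetched |
| // only if the whole element fits within partialInboundsSize (blended in below) |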
| |
| #if USE_SIMD16_GATHERS |
| // override cur indices with 0 if pitch is 0 |
| Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); |
| vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); |
| vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2); |
| |
| // are vertices partially OOB? |
| Value* vMaxVertex = VBROADCAST(maxVertex); |
| Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); |
| Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex); |
| |
| // are vertices fully in bounds? |
| Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); |
| Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex); |
| |
| Value *vGatherMask; |
| Value *vGatherMask2; |
| if (fetchState.bPartialVertexBuffer) |
| { |
| // are vertices below minVertex limit? |
| Value *vMinVertex = VBROADCAST(minVertex); |
| Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); |
| Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex); |
| |
| // only fetch lanes that pass both tests |
| vGatherMask = AND(vMaxGatherMask, vMinGatherMask); |
| vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2); |
| } |
| else |
| { |
| vGatherMask = vMaxGatherMask; |
| vGatherMask2 = vMaxGatherMask2; |
| } |
| |
| // blend in any partially OOB indices that have valid elements |
| vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); |
| vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2); |
| Value *pMask = vGatherMask; |
| Value *pMask2 = vGatherMask2; |
| vGatherMask = VMASK(vGatherMask); |
| vGatherMask2 = VMASK(vGatherMask2); |
| |
| // calculate the actual offsets into the VB |
| Value* vOffsets = MUL(vCurIndices, vStride); |
| vOffsets = ADD(vOffsets, vAlignmentOffsets); |
| |
| Value* vOffsets2 = MUL(vCurIndices2, vStride); |
| vOffsets2 = ADD(vOffsets2, vAlignmentOffsets); |
| |
| // if instance stride enable is: |
| // true - add product of the instanceID and advancement state to the offset into the VB |
| // false - value of vInstanceStride has been initialized to zero |
| vOffsets = ADD(vOffsets, vInstanceStride); |
| vOffsets2 = ADD(vOffsets2, vInstanceStride); |
| |
| #else |
| // override cur indices with 0 if pitch is 0 |
| Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); |
| vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); |
| |
| // are vertices partially OOB? |
| Value* vMaxVertex = VBROADCAST(maxVertex); |
| Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); |
| |
| // are vertices fully in bounds? |
| Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); |
| |
| Value *vGatherMask; |
| if (fetchState.bPartialVertexBuffer) |
| { |
| // are vertices below minVertex limit? |
| Value *vMinVertex = VBROADCAST(minVertex); |
| Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); |
| |
| // only fetch lanes that pass both tests |
| vGatherMask = AND(vMaxGatherMask, vMinGatherMask); |
| } |
| else |
| { |
| vGatherMask = vMaxGatherMask; |
| } |
| |
| // blend in any partially OOB indices that have valid elements |
| vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); |
| Value* pMask = vGatherMask; |
| vGatherMask = VMASK(vGatherMask); |
| |
| // calculate the actual offsets into the VB |
| Value* vOffsets = MUL(vCurIndices, vStride); |
| vOffsets = ADD(vOffsets, vAlignmentOffsets); |
| |
| // if instance stride enable is: |
| // true - add product of the instanceID and advancement state to the offset into the VB |
| // false - value of vInstanceStride has been initialized to zero |
| vOffsets = ADD(vOffsets, vInstanceStride); |
| |
| #endif |
| // Packing and component control |
| ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; |
| const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, |
| (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3}; |
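| // each compCtrl entry selects either fetched data (StoreSrc) or a constant default |
| // produced by GenerateCompCtrlVector for that output component |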
| |
| // Special gather/conversion for formats without equal component sizes |
| if (IsOddFormat((SWR_FORMAT)ied.Format)) |
| { |
| #if USE_SIMD16_GATHERS |
| Value *pResults[4]; |
| Value *pResults2[4]; |
| CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); |
| CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2); |
| ConvertFormat((SWR_FORMAT)ied.Format, pResults); |
| ConvertFormat((SWR_FORMAT)ied.Format, pResults2); |
| |
| for (uint32_t c = 0; c < 4; c += 1) |
| { |
| if (isComponentEnabled(compMask, c)) |
| { |
| vVertexElements[currentVertexElement] = pResults[c]; |
| vVertexElements2[currentVertexElement] = pResults2[c]; |
| currentVertexElement++; |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); |
| StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); |
| |
| outputElt += 1; |
| |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| } |
| #else |
| Value* pResults[4]; |
| CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); |
| ConvertFormat((SWR_FORMAT)ied.Format, pResults); |
| |
| for (uint32_t c = 0; c < 4; ++c) |
| { |
| if (isComponentEnabled(compMask, c)) |
| { |
| vVertexElements[currentVertexElement++] = pResults[c]; |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| } |
| #endif |
| } |
| else if(info.type[0] == SWR_TYPE_FLOAT) |
| { |
| ///@todo: support 64 bit vb accesses |
| Value* gatherSrc = VIMMED1(0.0f); |
| #if USE_SIMD16_GATHERS |
| Value* gatherSrc2 = VIMMED1(0.0f); |
| #endif |
| |
| SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), |
| "Unsupported format for standard gather fetch."); |
| |
| // Gather components from memory to store in a simdvertex structure |
| switch (bpc) |
| { |
| case 16: |
| { |
| #if USE_SIMD16_GATHERS |
| Value* vGatherResult[2]; |
| Value* vGatherResult2[2]; |
| Value *vMask; |
| Value *vMask2; |
| |
| // if we have at least one component out of x or y to fetch |
| if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) |
| { |
| // save mask as it is zero'd out after each gather |
| vMask = vGatherMask; |
| vMask2 = vGatherMask2; |
| |
| vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); |
| // e.g. result of first 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy |
| // |
| } |
| |
| // if we have at least one component out of z or w to fetch |
| if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) |
| { |
| // offset base to the next components(zw) in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| vMask = vGatherMask; |
| vMask2 = vGatherMask2; |
| |
| vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); |
| // e.g. result of second 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw |
| // |
| } |
| |
| |
| // if we have at least one component to shuffle into place |
| if (compMask) |
| { |
| Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); |
| Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); |
| |
| // Shuffle gathered components into place in simdvertex struct |
| Shuffle16bpcGather(args, false); // outputs to vVertexElements ref |
| Shuffle16bpcGather(args2, true); // outputs to vVertexElements2 ref |
| } |
| #else |
| Value* vGatherResult[2]; |
| Value *vMask; |
| |
| // if we have at least one component out of x or y to fetch |
| if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ |
| // save mask as it is zero'd out after each gather |
| vMask = vGatherMask; |
| |
| vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| // e.g. result of first 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy |
| // |
| } |
| |
| // if we have at least one component out of z or w to fetch |
| if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ |
| // offset base to the next components(zw) in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| vMask = vGatherMask; |
| |
| vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| // e.g. result of second 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw |
| // |
| } |
| |
| // if we have at least one component to shuffle into place |
| if(compMask){ |
| Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); |
| |
| // Shuffle gathered components into place in simdvertex struct |
| #if USE_SIMD16_SHADERS |
| Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref |
| #else |
| Shuffle16bpcGather(args); // outputs to vVertexElements ref |
| #endif |
| } |
| #endif |
| } |
| break; |
| case 32: |
| { |
| for (uint32_t i = 0; i < 4; i += 1) |
| { |
| #if USE_SIMD16_GATHERS |
| if (isComponentEnabled(compMask, i)) |
| { |
| // if we need to gather the component |
| if (compCtrl[i] == StoreSrc) |
| { |
| // save mask as it is zero'd out after each gather |
| Value *vMask = vGatherMask; |
| Value *vMask2 = vGatherMask2; |
| |
| // Gather a SIMD of vertices |
| // APIs allow a 4GB range for offsets |
| // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( |
| // But, we know that elements must be aligned for FETCH. :) |
| // Right shift the offset by a bit and then scale by 2 to remove the sign extension. |
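| // (elements are at least 2-byte aligned here, so bit 0 of the offset is zero and |
| // the shift-then-scale round trip is lossless) |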
| Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); |
| Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); |
| vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2)); |
| vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, C((char)2)); |
| |
| currentVertexElement += 1; |
| } |
| else |
| { |
| vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); |
| vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); |
| |
| currentVertexElement += 1; |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| #if USE_SIMD16_BUILDER |
| Value *pVtxSrc2[4]; |
| |
| // pack adjacent pairs of SIMD8s into SIMD16s |
| for (uint32_t c = 0; c < 4; c += 1) // 'c' avoids shadowing the component loop's 'i' |
| { |
| pVtxSrc2[c] = VUNDEF2_F(); |
| |
| pVtxSrc2[c] = INSERT(pVtxSrc2[c], vVertexElements[c], 0); |
| pVtxSrc2[c] = INSERT(pVtxSrc2[c], vVertexElements2[c], 1); |
| } |
| |
| // store SIMD16s |
| Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); |
| StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); |
| |
| #else |
| StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); |
| StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); |
| |
| #endif |
| outputElt += 1; |
| |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| |
| // offset base to the next component in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| #else |
| if (isComponentEnabled(compMask, i)) |
| { |
| // if we need to gather the component |
| if (compCtrl[i] == StoreSrc) |
| { |
| // save mask as it is zero'd out after each gather |
| Value *vMask = vGatherMask; |
| |
| // Gather a SIMD of vertices |
| // APIs allow a 4GB range for offsets |
| // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( |
| // But, we know that elements must be aligned for FETCH. :) |
| // Right shift the offset by a bit and then scale by 2 to remove the sign extension. |
| Value* vShiftedOffsets = VPSRLI(vOffsets, C(1)); |
| vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2)); |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| |
| // offset base to the next component in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| #endif |
| } |
| } |
| break; |
| case 64: |
| { |
| for (uint32_t i = 0; i < 4; i += 1) |
| { |
| #if USE_SIMD16_GATHERS |
| if (isComponentEnabled(compMask, i)) |
| { |
| // if we need to gather the component |
| if (compCtrl[i] == StoreSrc) |
| { |
| Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); |
| Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); |
| Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); |
| Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); |
| vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4)); |
| vMaskLo2 = S_EXT(vMaskLo2, VectorType::get(mInt64Ty, 4)); |
| vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4)); |
| vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4)); |
| vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4)); |
| vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4)); |
| vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4)); |
| vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4)); |
| |
| Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0)); |
| Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0)); |
| Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1)); |
| Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1)); |
| |
| Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); |
| |
| Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo, C((char)1)); |
| Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2, C((char)1)); |
| Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi, C((char)1)); |
| Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2, C((char)1)); |
| |
| pGatherLo = VCVTPD2PS(pGatherLo); |
| pGatherLo2 = VCVTPD2PS(pGatherLo2); |
| pGatherHi = VCVTPD2PS(pGatherHi); |
| pGatherHi2 = VCVTPD2PS(pGatherHi2); |
| |
| Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); |
| Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); |
| |
| vVertexElements[currentVertexElement] = pGather; |
| vVertexElements2[currentVertexElement] = pGather2; |
| |
| currentVertexElement += 1; |
| } |
| else |
| { |
| vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); |
| vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); |
| |
| currentVertexElement += 1; |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); |
| StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); |
| |
| outputElt += 1; |
| |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| |
| // offset base to the next component in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)8)); |
| #else |
| if (isComponentEnabled(compMask, i)) |
| { |
| // if we need to gather the component |
| if (compCtrl[i] == StoreSrc) |
| { |
| Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({0, 1, 2, 3})); |
| Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({4, 5, 6, 7})); |
| vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4)); |
| vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4)); |
| vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4)); |
| vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4)); |
| |
| Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0)); |
| Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1)); |
| |
| Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); |
| |
| Value* pGatherLo = GATHERPD(vZeroDouble, |
| pStreamBase, vOffsetsLo, vMaskLo, C((char)1)); |
| Value* pGatherHi = GATHERPD(vZeroDouble, |
| pStreamBase, vOffsetsHi, vMaskHi, C((char)1)); |
| |
| pGatherLo = VCVTPD2PS(pGatherLo); |
| pGatherHi = VCVTPD2PS(pGatherHi); |
| |
| Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({0, 1, 2, 3, 4, 5, 6, 7})); |
| |
| vVertexElements[currentVertexElement++] = pGather; |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| |
| // offset base to the next component in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)8)); |
| #endif |
| } |
| } |
| break; |
| default: |
| SWR_INVALID("Tried to fetch invalid FP format"); |
| break; |
| } |
| } |
| else |
| { |
| Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; |
| ConversionType conversionType = CONVERT_NONE; |
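| // CastOpsEnd acts as a 'no extension selected' sentinel; the switch below picks |
| // the real cast based on the component type |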
| |
| SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), |
| "Unsupported format for standard gather fetch."); |
| |
| switch(info.type[0]) |
| { |
| case SWR_TYPE_UNORM: |
| conversionType = CONVERT_NORMALIZED; |
| // fall through: UNORM data is zero-extended like UINT before normalization |
| case SWR_TYPE_UINT: |
| extendCastType = Instruction::CastOps::ZExt; |
| break; |
| case SWR_TYPE_SNORM: |
| conversionType = CONVERT_NORMALIZED; |
| // fall through: SNORM data is sign-extended like SINT before normalization |
| case SWR_TYPE_SINT: |
| extendCastType = Instruction::CastOps::SExt; |
| break; |
| case SWR_TYPE_USCALED: |
| conversionType = CONVERT_USCALED; |
| extendCastType = Instruction::CastOps::UIToFP; |
| break; |
| case SWR_TYPE_SSCALED: |
| conversionType = CONVERT_SSCALED; |
| extendCastType = Instruction::CastOps::SIToFP; |
| break; |
| case SWR_TYPE_SFIXED: |
| conversionType = CONVERT_SFIXED; |
| extendCastType = Instruction::CastOps::SExt; |
| break; |
| default: |
| break; |
| } |
| |
| // value substituted when component of gather is masked |
| Value* gatherSrc = VIMMED1(0); |
| #if USE_SIMD16_GATHERS |
| Value* gatherSrc2 = VIMMED1(0); |
| #endif |
| |
| // Gather components from memory to store in a simdvertex structure |
| switch (bpc) |
| { |
| case 8: |
| { |
| // if we have at least one component to fetch |
| if (compMask) |
| { |
| #if USE_SIMD16_GATHERS |
| Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); |
| Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2, C((char)1)); |
| // e.g. result of an 8x32bit integer gather for 8bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw |
| |
| Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); |
| Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle); |
| |
| // Shuffle gathered components into place in simdvertex struct |
| Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref |
| Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements2 ref |
| #else |
| Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); |
| // e.g. result of an 8x32bit integer gather for 8bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw |
| |
| Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); |
| |
| // Shuffle gathered components into place in simdvertex struct |
| #if USE_SIMD16_SHADERS |
| Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref |
| #else |
| Shuffle8bpcGatherd(args); // outputs to vVertexElements ref |
| #endif |
| #endif |
| } |
| } |
| break; |
| case 16: |
| { |
| #if USE_SIMD16_GATHERS |
| Value* vGatherResult[2]; |
| Value *vMask; |
| Value* vGatherResult2[2]; |
| Value *vMask2; |
| |
| // if we have at least one component out of x or y to fetch |
| if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) |
| { |
| // save mask as it is zero'd out after each gather |
| vMask = vGatherMask; |
| vMask2 = vGatherMask2; |
| |
| vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); |
| // e.g. result of first 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy |
| // |
| } |
| |
| // if we have at least one component out of z or w to fetch |
| if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) |
| { |
| // offset base to the next components(zw) in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| vMask = vGatherMask; |
| vMask2 = vGatherMask2; |
| |
| vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); |
| // e.g. result of second 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw |
| // |
| } |
| |
| // if we have at least one component to shuffle into place |
| if (compMask) |
| { |
| Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); |
| Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); |
| |
| // Shuffle gathered components into place in simdvertex struct |
| Shuffle16bpcGather(args, false); // outputs to vVertexElements ref |
| Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref |
| } |
| #else |
| Value* vGatherResult[2]; |
| Value *vMask; |
| |
| // if we have at least one component out of x or y to fetch |
| if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ |
| // save mask as it is zero'd out after each gather |
| vMask = vGatherMask; |
| |
| vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| // e.g. result of first 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy |
| // |
| } |
| |
| // if we have at least one component out of z or w to fetch |
| if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ |
| // offset base to the next components(zw) in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| vMask = vGatherMask; |
| |
| vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| // e.g. result of second 8x32bit integer gather for 16bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw |
| // |
| } |
| |
| // if we have at least one component to shuffle into place |
| if(compMask){ |
| Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, |
| currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); |
| |
| // Shuffle gathered components into place in simdvertex struct |
| #if USE_SIMD16_SHADERS |
| Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref |
| #else |
| Shuffle16bpcGather(args); // outputs to vVertexElements ref |
| #endif |
| } |
| #endif |
| } |
| break; |
| case 32: |
| { |
| // gather components directly into place in the simdvertex struct
| for (uint32_t i = 0; i < 4; i++) |
| { |
| if (isComponentEnabled(compMask, i)) |
| { |
| // if we need to gather the component |
| if (compCtrl[i] == StoreSrc) |
| { |
| #if USE_SIMD16_GATHERS |
| // save mask as it is zero'd out after each gather |
| Value *vMask = vGatherMask; |
| Value *vMask2 = vGatherMask2; |
| |
| Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); |
| |
| if (conversionType == CONVERT_USCALED) |
| { |
| pGather = UI_TO_FP(pGather, mSimdFP32Ty); |
| pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty); |
| } |
| else if (conversionType == CONVERT_SSCALED) |
| { |
| pGather = SI_TO_FP(pGather, mSimdFP32Ty); |
| pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty); |
| } |
| else if (conversionType == CONVERT_SFIXED) |
| { |
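| // SFIXED appears to be 16.16 fixed point (hence the 1/65536 factor):
| // float_value = (float)fixed_bits / 65536.0f, e.g. 0x00018000 -> 1.5f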
| pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); |
| pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); |
| } |
| |
| vVertexElements[currentVertexElement] = pGather; |
| vVertexElements2[currentVertexElement] = pGather2; |
| // e.g. result of a single 8x32bit integer gather for 32bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx |
| |
| currentVertexElement += 1; |
| #else |
| // save mask as it is zero'd out after each gather |
| Value *vMask = vGatherMask; |
| |
| Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); |
| |
| if (conversionType == CONVERT_USCALED) |
| { |
| pGather = UI_TO_FP(pGather, mSimdFP32Ty); |
| } |
| else if (conversionType == CONVERT_SSCALED) |
| { |
| pGather = SI_TO_FP(pGather, mSimdFP32Ty); |
| } |
| else if (conversionType == CONVERT_SFIXED) |
| { |
| pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f))); |
| } |
| |
| vVertexElements[currentVertexElement++] = pGather; |
| // e.g. result of a single 8x32bit integer gather for 32bit components |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx |
| #endif |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| #if USE_SIMD16_GATHERS |
| vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); |
| vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); |
| |
| currentVertexElement += 1; |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #endif |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| #if USE_SIMD16_GATHERS |
| StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); |
| StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); |
| |
| outputElt += 1; |
| #else |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| #endif |
| |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| |
| } |
| |
| // offset base to the next component in the vertex to gather |
| pStreamBase = GEP(pStreamBase, C((char)4)); |
| } |
| } |
| break; |
| } |
| } |
| } |
| |
| // if we have a partially filled vVertexElement struct, output it |
| if (currentVertexElement > 0) |
| { |
| #if USE_SIMD16_GATHERS |
| StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements); |
| StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2); |
| |
| outputElt += 1; |
| #else |
| StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); |
| #endif |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Loads a simd of valid indices. OOB indices are set to 0 |
| /// *Note* have to do 8bit index checking in scalar until we have AVX-512
| /// support
| /// @param pIndices - pointer to 8 bit indices |
| /// @param pLastIndex - pointer to last valid index |
| Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) |
| { |
| // can fit 4 8 bit integers per vWidth lane
| Value* vIndices = VUNDEF_I(); |
| |
| // store 0 index on stack to be used to conditionally load from if index address is OOB |
| Value* pZeroIndex = ALLOCA(mInt8Ty); |
| STORE(C((uint8_t)0), pZeroIndex); |
| |
| // Load a SIMD of index pointers |
| for(int64_t lane = 0; lane < mVWidth; lane++) |
| { |
| // Calculate the address of the requested index |
| Value *pIndex = GEP(pIndices, C(lane)); |
| |
| // check if the address is less than the max index, |
| Value* mask = ICMP_ULT(pIndex, pLastIndex); |
| |
| // if valid, load the index. if not, load 0 from the stack |
| Value* pValid = SELECT(mask, pIndex, pZeroIndex); |
| Value *index = LOAD(pValid, "valid index"); |
| |
| // zero extend the index to 32 bits and insert into the correct simd lane
| index = Z_EXT(index, mInt32Ty); |
| vIndices = VINSERT(vIndices, index, lane); |
| } |
| return vIndices; |
| } |
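| 
| // For illustration only: the scalar C equivalent of the IR emitted above.
| // The names (GetValid8bitIndicesRef, width) are hypothetical; this sketch
| // assumes a SIMD of `width` lanes and is not part of the jitter.
| #if 0
| static void GetValid8bitIndicesRef(const uint8_t* pIndices, const uint8_t* pLastIndex,
| uint32_t* vIndices, int width)
| {
| for (int lane = 0; lane < width; ++lane)
| {
| const uint8_t* pIndex = pIndices + lane;
| // OOB index addresses read 0 instead of faulting
| vIndices[lane] = (pIndex < pLastIndex) ? (uint32_t)*pIndex : 0;
| }
| }
| #endif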
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Loads a simd of valid indices. OOB indices are set to 0 |
| /// *Note* have to do 16bit index checking in scalar until we have AVX-512 |
| /// support |
| /// @param pIndices - pointer to 16 bit indices |
| /// @param pLastIndex - pointer to last valid index |
| Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) |
| { |
| // can fit 2 16 bit integers per vWidth lane |
| Value* vIndices = VUNDEF_I(); |
| |
| // store 0 index on stack to be used to conditionally load from if index address is OOB |
| Value* pZeroIndex = ALLOCA(mInt16Ty); |
| STORE(C((uint16_t)0), pZeroIndex); |
| |
| // Load a SIMD of index pointers |
| for(int64_t lane = 0; lane < mVWidth; lane++) |
| { |
| // Calculate the address of the requested index |
| Value *pIndex = GEP(pIndices, C(lane)); |
| |
| // check if the address is less than the max index, |
| Value* mask = ICMP_ULT(pIndex, pLastIndex); |
| |
| // if valid, load the index. if not, load 0 from the stack |
| Value* pValid = SELECT(mask, pIndex, pZeroIndex); |
| Value *index = LOAD(pValid, "valid index"); |
| |
| // zero extend the index to 32 bits and insert into the correct simd lane
| index = Z_EXT(index, mInt32Ty); |
| vIndices = VINSERT(vIndices, index, lane); |
| } |
| return vIndices; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Loads a simd of valid indices. OOB indices are set to 0 |
| /// @param pIndices - pointer to 32 bit indices |
| /// @param pLastIndex - pointer to last valid index |
| Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) |
| { |
| DataLayout dL(JM()->mpCurrentModule); |
| unsigned int ptrSize = dL.getPointerSize() * 8; // ptr size in bits |
| Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize)); |
| Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize)); |
| |
| // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index) |
| Value* numIndicesLeft = SUB(iLastIndex,iIndices); |
| numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); |
| numIndicesLeft = SDIV(numIndicesLeft, C(4)); |
| |
| // create a vector of index counts from the base index ptr passed into the fetch |
| const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)}; |
| Constant* vIndexOffsets = ConstantVector::get(vecIndices); |
| |
| // compare index count to the max valid index |
| // e.g. vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load
| // vIndexOffsets 0 1 2 3 4 5 6 7 |
| // ------------------------------ |
| // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass |
| // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 |
| Value* vMaxIndex = VBROADCAST(numIndicesLeft); |
| Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets); |
| |
| // VMASKLOAD takes an *i8 src pointer |
| pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0)); |
| |
| // Load the indices; OOB loads 0 |
| return MASKLOADD(pIndices,vIndexMask); |
| } |
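| 
| // For illustration only: the mask computation above in scalar C. The
| // hardcoded {0..7} offset vector assumes an 8-wide simd (AVX2); the name
| // GetValid32bitIndicesRef is hypothetical and this sketch is not part of
| // the jitter.
| #if 0
| static void GetValid32bitIndicesRef(const uint32_t* pIndices, const uint32_t* pLastIndex,
| uint32_t* vIndices)
| {
| // (endPtr - curPtr) / sizeof(index)
| int numIndicesLeft = (int)(pLastIndex - pIndices);
| for (int lane = 0; lane < 8; ++lane)
| {
| // lanes whose offset exceeds the remaining index count load 0 (masked load)
| vIndices[lane] = (lane < numIndicesLeft) ? pIndices[lane] : 0;
| }
| }
| #endif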
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, |
| /// denormalizes if needed, converts to F32 if needed, and positions in |
| /// the proper SIMD rows to be output to the simdvertex structure
| /// @param args: (tuple of args, listed below) |
| /// @param vGatherResult - 8 gathered 8bpc vertices |
| /// @param pVtxOut - base pointer to output simdvertex struct |
| /// @param extendType - sign extend or zero extend |
| /// @param conversionType - conversion to apply (none, normalized, scaled, sfixed)
| /// @param currentVertexElement - reference to the current vVertexElement |
| /// @param outputElt - reference to the current offset from simdvertex we're outputting to
| /// @param compMask - component packing mask |
| /// @param compCtrl - component control val |
| /// @param vVertexElements[4] - vertex components to output |
| /// @param swizzle[4] - component swizzle location |
| #if USE_SIMD16_SHADERS |
| void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) |
| #else |
| void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) |
| #endif |
| { |
| // Unpack tuple args |
| Value*& vGatherResult = std::get<0>(args); |
| Value* pVtxOut = std::get<1>(args); |
| const Instruction::CastOps extendType = std::get<2>(args); |
| const ConversionType conversionType = std::get<3>(args); |
| uint32_t &currentVertexElement = std::get<4>(args);
| uint32_t &outputElt = std::get<5>(args); |
| const ComponentEnable compMask = std::get<6>(args); |
| const ComponentControl (&compCtrl)[4] = std::get<7>(args); |
| Value* (&vVertexElements)[4] = std::get<8>(args); |
| const uint32_t (&swizzle)[4] = std::get<9>(args); |
| |
| // cast types |
| Type* vGatherTy = mSimdInt32Ty; |
| Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits |
| |
| // have to do extra work for sign extending |
| if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ |
| Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
| Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits |
| |
| // shuffle mask, including any swizzling |
| const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; |
| const char z = (char)swizzle[2]; const char w = (char)swizzle[3]; |
| Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12), |
| char(y), char(y+4), char(y+8), char(y+12), |
| char(z), char(z+4), char(z+8), char(z+12), |
| char(w), char(w+4), char(w+8), char(w+12), |
| char(x), char(x+4), char(x+8), char(x+12), |
| char(y), char(y+4), char(y+8), char(y+12), |
| char(z), char(z+4), char(z+8), char(z+12), |
| char(w), char(w+4), char(w+8), char(w+12)}); |
| |
| Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); |
| // after pshufb: group components together in each 128bit lane |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww |
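| // (each PSHUFB mask byte selects a byte within its own 128bit lane, so
| // x, x+4, x+8, x+12 collect the x component of the four gathered dwords)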
| |
| Value* vi128XY = nullptr; |
| if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ |
| vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); |
| // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) |
| } |
| |
| // do the same for zw components |
| Value* vi128ZW = nullptr; |
| if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ |
| vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); |
| } |
| |
| // init denormalize variables if needed |
| Instruction::CastOps fpCast; |
| Value* conversionFactor; |
| |
| switch (conversionType) |
| { |
| case CONVERT_NORMALIZED: |
| fpCast = Instruction::CastOps::SIToFP; |
| conversionFactor = VIMMED1((float)(1.0 / 127.0)); |
| break; |
| case CONVERT_SSCALED: |
| fpCast = Instruction::CastOps::SIToFP; |
| conversionFactor = VIMMED1((float)(1.0)); |
| break; |
| case CONVERT_USCALED: |
| SWR_INVALID("Type should not be sign extended!"); |
| conversionFactor = nullptr; |
| break; |
| default: |
| SWR_ASSERT(conversionType == CONVERT_NONE); |
| conversionFactor = nullptr; |
| break; |
| } |
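| 
| // note: these are the standard n-bit normalization factors, 1/(2^(n-1)-1)
| // for signed formats (127 = 2^7-1 here) and 1/(2^n-1) for unsigned ones
| // (255 = 2^8-1 in the zero-extend path below)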
| |
| // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
| for (uint32_t i = 0; i < 4; i++) |
| { |
| if (isComponentEnabled(compMask, i)) |
| { |
| if (compCtrl[i] == ComponentControl::StoreSrc) |
| { |
| // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 |
| uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; |
| // if x or y, use vi128XY permute result, else use vi128ZW |
| Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; |
| |
| // sign extend |
| vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty)); |
| |
| // denormalize if needed |
| if (conversionType != CONVERT_NONE) |
| { |
| vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); |
| } |
| currentVertexElement++; |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| } |
| } |
| // else zero extend |
| else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) |
| { |
| // init denormalize variables if needed |
| Instruction::CastOps fpCast; |
| Value* conversionFactor; |
| |
| switch (conversionType) |
| { |
| case CONVERT_NORMALIZED: |
| fpCast = Instruction::CastOps::UIToFP; |
| conversionFactor = VIMMED1((float)(1.0 / 255.0)); |
| break; |
| case CONVERT_USCALED: |
| fpCast = Instruction::CastOps::UIToFP; |
| conversionFactor = VIMMED1((float)(1.0)); |
| break; |
| case CONVERT_SSCALED: |
| SWR_INVALID("Type should not be zero extended!"); |
| conversionFactor = nullptr; |
| break; |
| default: |
| SWR_ASSERT(conversionType == CONVERT_NONE); |
| conversionFactor = nullptr; |
| break; |
| } |
| |
| // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits |
| for (uint32_t i = 0; i < 4; i++) |
| { |
| if (isComponentEnabled(compMask, i)) |
| { |
| if (compCtrl[i] == ComponentControl::StoreSrc) |
| { |
| // pshufb masks for each component |
| Value* vConstMask; |
| switch (swizzle[i]) |
| { |
| case 0: |
| // x shuffle mask |
| vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, |
| 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 }); |
| break; |
| case 1: |
| // y shuffle mask |
| vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, |
| 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 }); |
| break; |
| case 2: |
| // z shuffle mask |
| vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, |
| 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 }); |
| break; |
| case 3: |
| // w shuffle mask |
| vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, |
| 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 }); |
| break; |
| default: |
| vConstMask = nullptr; |
| break; |
| } |
| |
| vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy); |
| // after pshufb for x channel |
| // 256i - 0 1 2 3 4 5 6 7 |
| // x000 x000 x000 x000 x000 x000 x000 x000 |
| |
| // denormalize if needed |
| if (conversionType != CONVERT_NONE) |
| { |
| vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); |
| } |
| currentVertexElement++; |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| } |
| } |
| else |
| { |
| SWR_INVALID("Unsupported conversion type"); |
| } |
| } |
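| 
| // For illustration only: the per-component conversion math applied above,
| // in scalar form. The helper names are hypothetical; this is a sketch of
| // the arithmetic only, not part of the jitter.
| #if 0
| // normalized: remap the integer range onto [0,1] (unsigned) or [-1,1] (signed)
| static float ConvertNormalized8(int32_t bits, bool isSigned)
| {
| return isSigned ? (float)bits / 127.0f : (float)bits / 255.0f;
| }
| // scaled: plain integer-to-float conversion, no range remapping
| static float ConvertScaled8(int32_t bits)
| {
| return (float)bits;
| }
| #endif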
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, |
| /// denormalizes if needed, converts to F32 if needed, and positions in |
| /// the proper SIMD rows to be output to the simdvertex structure
| /// @param args: (tuple of args, listed below) |
| /// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index |
| /// @param pVtxOut - base pointer to output simdvertex struct |
| /// @param extendType - sign extend or zero extend |
| /// @param conversionType - conversion to apply (none, normalized, scaled)
| /// @param currentVertexElement - reference to the current vVertexElement |
| /// @param outputElt - reference to the current offset from simdvertex we're outputting to
| /// @param compMask - component packing mask |
| /// @param compCtrl - component control val |
| /// @param vVertexElements[4] - vertex components to output |
| #if USE_SIMD16_SHADERS |
| void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2) |
| #else |
| void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) |
| #endif |
| { |
| // Unpack tuple args |
| Value* (&vGatherResult)[2] = std::get<0>(args); |
| Value* pVtxOut = std::get<1>(args); |
| const Instruction::CastOps extendType = std::get<2>(args); |
| const ConversionType conversionType = std::get<3>(args); |
| uint32_t &currentVertexElement = std::get<4>(args);
| uint32_t &outputElt = std::get<5>(args); |
| const ComponentEnable compMask = std::get<6>(args); |
| const ComponentControl(&compCtrl)[4] = std::get<7>(args); |
| Value* (&vVertexElements)[4] = std::get<8>(args); |
| |
| // cast types |
| Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); |
| Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits |
| |
| // have to do extra work for sign extending |
| if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| |
| (extendType == Instruction::CastOps::FPExt)) |
| { |
| // is this a partial-precision (16bit) float?
| bool bFP = (extendType == Instruction::CastOps::FPExt);
| |
| Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane |
| Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits |
| |
| // shuffle mask |
| Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, |
| 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); |
| Value* vi128XY = nullptr; |
| if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ |
| Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); |
| // after pshufb: group components together in each 128bit lane |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy |
| |
| vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); |
| // after PERMD: move and pack xy components into each 128bit lane |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy |
| } |
| |
| // do the same for zw components |
| Value* vi128ZW = nullptr; |
| if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ |
| Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); |
| vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); |
| } |
| |
| // init denormalize variables if needed |
| Instruction::CastOps IntToFpCast; |
| Value* conversionFactor; |
| |
| switch (conversionType) |
| { |
| case CONVERT_NORMALIZED: |
| IntToFpCast = Instruction::CastOps::SIToFP; |
| conversionFactor = VIMMED1((float)(1.0 / 32767.0)); |
| break; |
| case CONVERT_SSCALED: |
| IntToFpCast = Instruction::CastOps::SIToFP; |
| conversionFactor = VIMMED1((float)(1.0)); |
| break; |
| case CONVERT_USCALED: |
| SWR_INVALID("Type should not be sign extended!"); |
| conversionFactor = nullptr; |
| break; |
| default: |
| SWR_ASSERT(conversionType == CONVERT_NONE); |
| conversionFactor = nullptr; |
| break; |
| } |
| |
| // sign extend all enabled components. If we have a full vVertexElements, output to the current simdvertex
| for (uint32_t i = 0; i < 4; i++) |
| { |
| if (isComponentEnabled(compMask, i)) |
| { |
| if (compCtrl[i] == ComponentControl::StoreSrc) |
| { |
| // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 |
| uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; |
| // if x or y, use vi128XY permute result, else use vi128ZW |
| Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; |
| |
| if (bFP) { |
| // extract 128 bit lanes and convert each 16bit half-float component to 32bit float
| vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); |
| } |
| else { |
| // extract 128 bit lanes to sign extend each component |
| vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); |
| |
| // denormalize if needed |
| if (conversionType != CONVERT_NONE) { |
| vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); |
| } |
| } |
| currentVertexElement++; |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| } |
| } |
| // else zero extend |
| else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) |
| { |
| // pshufb masks for each component |
| Value* vConstMask[2]; |
| if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){ |
| // x/z shuffle mask |
| vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, |
| 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); |
| } |
| |
| if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){ |
| // y/w shuffle mask |
| vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, |
| 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); |
| } |
| |
| // init denormalize variables if needed |
| Instruction::CastOps fpCast; |
| Value* conversionFactor; |
| |
| switch (conversionType) |
| { |
| case CONVERT_NORMALIZED: |
| fpCast = Instruction::CastOps::UIToFP; |
| conversionFactor = VIMMED1((float)(1.0 / 65535.0)); |
| break; |
| case CONVERT_USCALED: |
| fpCast = Instruction::CastOps::UIToFP; |
| conversionFactor = VIMMED1((float)(1.0));
| break; |
| case CONVERT_SSCALED: |
| SWR_INVALID("Type should not be zero extended!"); |
| conversionFactor = nullptr; |
| break; |
| default: |
| SWR_ASSERT(conversionType == CONVERT_NONE); |
| conversionFactor = nullptr; |
| break; |
| } |
| |
| // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits |
| for (uint32_t i = 0; i < 4; i++) |
| { |
| if (isComponentEnabled(compMask, i)) |
| { |
| if (compCtrl[i] == ComponentControl::StoreSrc) |
| { |
| // select correct constMask for x/z or y/w pshufb |
| uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; |
| // if x or y, use vi128XY permute result, else use vi128ZW |
| uint32_t selectedGather = (i < 2) ? 0 : 1; |
| |
| vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); |
| // after pshufb mask for x channel; z uses the same shuffle from the second gather |
| // 256i - 0 1 2 3 4 5 6 7 |
| // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 |
| |
| // denormalize if needed |
| if (conversionType != CONVERT_NONE) |
| { |
| vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); |
| } |
| currentVertexElement++; |
| } |
| else |
| { |
| #if USE_SIMD16_SHADERS |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); |
| #else |
| vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); |
| #endif |
| } |
| |
| if (currentVertexElement > 3) |
| { |
| StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); |
| // reset to the next vVertexElement to output |
| currentVertexElement = 0; |
| } |
| } |
| } |
| } |
| else |
| { |
| SWR_INVALID("Unsupported conversion type"); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Output a simdvertex worth of elements to the current outputElt |
| /// @param pVtxOut - base address of VIN output struct |
| /// @param outputElt - simdvertex offset in VIN to write to |
| /// @param numEltsToStore - number of simdvertex rows to write out |
| /// @param vVertexElements - LLVM Value*[] simdvertex to write out |
| void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) |
| { |
| SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); |
| |
| for(uint32_t c = 0; c < numEltsToStore; ++c) |
| { |
| // STORE expects FP32 x vWidth type, just bitcast if needed |
| if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) |
| { |
| #if FETCH_DUMP_VERTEX |
| PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); |
| #endif |
| vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); |
| } |
| #if FETCH_DUMP_VERTEX |
| else |
| { |
| PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); |
| } |
| #endif |
| // outputElt * 4 = offsetting by the size of a simdvertex
| // + c offsets to a 32bit x vWidth row within the current vertex
| // (the SIMD16 path below doubles both strides: two simd8 halves per vertex)
| #if USE_SIMD16_SHADERS |
| Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP"); |
| #else |
| Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); |
| #endif |
| STORE(vVertexElements[c], dest); |
| } |
| } |
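| 
| // Worked example of the destGEP math above (non-SIMD16 path): with
| // outputElt = 2 and c = 1, dest is row 2*4+1 = 9 of 32bit x vWidth data,
| // i.e. 9 * 32 bytes past pVtxOut for an 8-wide simd.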
| |
| #if USE_SIMD16_BUILDER |
| void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) |
| { |
| SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); |
| |
| for (uint32_t c = 0; c < numEltsToStore; ++c) |
| { |
| // STORE expects FP32 x vWidth type, just bitcast if needed |
| if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) |
| { |
| #if FETCH_DUMP_VERTEX |
| PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); |
| #endif |
| vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty); |
| } |
| #if FETCH_DUMP_VERTEX |
| else |
| { |
| PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] }); |
| } |
| #endif |
| // outputElt * 4 = offsetting by the size of a simdvertex |
| // + c offsets to a 32bit x vWidth row within the current vertex |
| Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); |
| STORE(vVertexElements[c], dest); |
| } |
| } |
| |
| #endif |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Generates a constant vector of values based on the |
| /// ComponentControl value |
| /// @param ctrl - ComponentControl value |
| #if USE_SIMD16_SHADERS |
| Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2) |
| #else |
| Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) |
| #endif |
| { |
| switch(ctrl) |
| { |
| case NoStore: return VUNDEF_I(); |
| case Store0: return VIMMED1(0); |
| case Store1Fp: return VIMMED1(1.0f); |
| case Store1Int: return VIMMED1(1); |
| case StoreVertexId: |
| { |
| #if USE_SIMD16_SHADERS |
| Value* pId; |
| if (useVertexID2) |
| { |
| pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); |
| } |
| else |
| { |
| pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); |
| } |
| #else |
| Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); |
| #endif |
| return VBROADCAST(pId); |
| } |
| case StoreInstanceId: |
| { |
| Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); |
| return VBROADCAST(pId); |
| } |
| case StoreSrc: |
| default: SWR_INVALID("Invalid component control"); return VUNDEF_I(); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Returns the enable mask for the specified component. |
| /// @param enableMask - enable bits |
| /// @param component - component to check if enabled. |
| bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) |
| { |
| switch (component) |
| { |
| // X |
| case 0: return (enableMask & ComponentEnable::X); |
| // Y |
| case 1: return (enableMask & ComponentEnable::Y); |
| // Z |
| case 2: return (enableMask & ComponentEnable::Z); |
| // W |
| case 3: return (enableMask & ComponentEnable::W); |
| |
| default: return false; |
| } |
| } |
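| 
| // Assuming ComponentEnable assigns X/Y/Z/W to bits 0..3 (as the masks above
| // suggest), the switch is equivalent to the branchless sketch below; kept
| // separate so the enum stays the single source of truth.
| #if 0
| static bool isComponentEnabledFast(ComponentEnable enableMask, uint8_t component)
| {
| return (component < 4) && ((enableMask >> component) & 0x1);
| }
| #endif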
| |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief JITs from fetch shader IR |
| /// @param hJitMgr - JitManager handle |
| /// @param func - LLVM function IR |
| /// @return PFN_FETCH_FUNC - pointer to fetch code |
| PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) |
| { |
| const llvm::Function* func = (const llvm::Function*)hFunc; |
| JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); |
| PFN_FETCH_FUNC pfnFetch; |
| |
| pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); |
| // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module |
| pJitMgr->mIsModuleFinalized = true; |
| |
| #if defined(KNOB_SWRC_TRACING) |
| char fName[1024]; |
| const char *funcName = func->getName().data(); |
| snprintf(fName, sizeof(fName), "%s.bin", funcName);
| FILE *fd = fopen(fName, "wb");
| if (fd)
| {
| fwrite((void *)pfnFetch, 1, 2048, fd);
| fclose(fd);
| }
| #endif |
| |
| pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final"); |
| |
| return pfnFetch; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief JIT compiles fetch shader |
| /// @param hJitMgr - JitManager handle |
| /// @param state - fetch state to build function from |
| extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) |
| { |
| JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); |
| |
| pJitMgr->SetupNewModule(); |
| |
| FetchJit theJit(pJitMgr); |
| HANDLE hFunc = theJit.Create(state); |
| |
| return JitFetchFunc(hJitMgr, hFunc); |
| } |
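| 
| #if 0
| // For illustration only: one way a caller might drive the fetch jitter.
| // The PFN_FETCH_FUNC signature (SWR_FETCH_CONTEXT&, simdvertex&) is an
| // assumption here, and the setup of hJitMgr and state is elided; a sketch,
| // not SWR's actual call site.
| static void ExampleCompileAndFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state,
| SWR_FETCH_CONTEXT& fetchInfo, simdvertex& vout)
| {
| // compile once per unique fetch state; cache the result for reuse
| PFN_FETCH_FUNC pfnFetch = JitCompileFetch(hJitMgr, state);
| pfnFetch(fetchInfo, vout);
| }
| #endif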