| /**************************************************************************** |
| * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * @file frontend.cpp |
| * |
| * @brief Implementation for Frontend which handles vertex processing, |
| * primitive assembly, clipping, binning, etc. |
| * |
| ******************************************************************************/ |
| |
| #include "api.h" |
| #include "frontend.h" |
| #include "backend.h" |
| #include "context.h" |
| #include "rdtsc_core.h" |
| #include "utils.h" |
| #include "threads.h" |
| #include "pa.h" |
| #include "clip.h" |
| #include "tilemgr.h" |
| #include "tessellator.h" |
| #include <limits> |
| #include <iostream> |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Helper macro to generate a bitmask |
| static INLINE uint32_t GenMask(uint32_t numBits) |
| { |
| SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__); |
| return ((1U << numBits) - 1); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief FE handler for SwrSync. |
| /// @param pContext - pointer to SWR context. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Even thread has a unique id. |
| /// @param pUserData - Pointer to user data passed back to sync callback. |
| /// @todo This should go away when we switch this to use compute threading. |
| void ProcessSync( |
| SWR_CONTEXT *pContext, |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| void *pUserData) |
| { |
| BE_WORK work; |
| work.type = SYNC; |
| work.pfnWork = ProcessSyncBE; |
| |
| MacroTileMgr *pTileMgr = pDC->pTileMgr; |
| pTileMgr->enqueue(0, 0, &work); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief FE handler for SwrDestroyContext. |
| /// @param pContext - pointer to SWR context. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Even thread has a unique id. |
| /// @param pUserData - Pointer to user data passed back to sync callback. |
| void ProcessShutdown( |
| SWR_CONTEXT *pContext, |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| void *pUserData) |
| { |
| BE_WORK work; |
| work.type = SHUTDOWN; |
| work.pfnWork = ProcessShutdownBE; |
| |
| MacroTileMgr *pTileMgr = pDC->pTileMgr; |
| // Enqueue at least 1 work item for each worker thread |
| // account for number of numa nodes |
| uint32_t numNumaNodes = pContext->threadPool.numaMask + 1; |
| |
| for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i) |
| { |
| for (uint32_t n = 0; n < numNumaNodes; ++n) |
| { |
| pTileMgr->enqueue(i, n, &work); |
| } |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief FE handler for SwrClearRenderTarget. |
| /// @param pContext - pointer to SWR context. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Even thread has a unique id. |
| /// @param pUserData - Pointer to user data passed back to clear callback. |
| /// @todo This should go away when we switch this to use compute threading. |
| void ProcessClear( |
| SWR_CONTEXT *pContext, |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| void *pUserData) |
| { |
| CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData; |
| MacroTileMgr *pTileMgr = pDC->pTileMgr; |
| |
| // queue a clear to each macro tile |
| // compute macro tile bounds for the specified rect |
| uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; |
| uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; |
| uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; |
| uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; |
| |
| BE_WORK work; |
| work.type = CLEAR; |
| work.pfnWork = ProcessClearBE; |
| work.desc.clear = *pDesc; |
| |
| for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) |
| { |
| for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) |
| { |
| pTileMgr->enqueue(x, y, &work); |
| } |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief FE handler for SwrStoreTiles. |
| /// @param pContext - pointer to SWR context. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Even thread has a unique id. |
| /// @param pUserData - Pointer to user data passed back to callback. |
| /// @todo This should go away when we switch this to use compute threading. |
| void ProcessStoreTiles( |
| SWR_CONTEXT *pContext, |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| void *pUserData) |
| { |
| AR_BEGIN(FEProcessStoreTiles, pDC->drawId); |
| MacroTileMgr *pTileMgr = pDC->pTileMgr; |
| STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData; |
| |
| // queue a store to each macro tile |
| // compute macro tile bounds for the specified rect |
| uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; |
| uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; |
| uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; |
| uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; |
| |
| // store tiles |
| BE_WORK work; |
| work.type = STORETILES; |
| work.pfnWork = ProcessStoreTilesBE; |
| work.desc.storeTiles = *pDesc; |
| |
| for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) |
| { |
| for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) |
| { |
| pTileMgr->enqueue(x, y, &work); |
| } |
| } |
| |
| AR_END(FEProcessStoreTiles, 0); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief FE handler for SwrInvalidateTiles. |
| /// @param pContext - pointer to SWR context. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Even thread has a unique id. |
| /// @param pUserData - Pointer to user data passed back to callback. |
| /// @todo This should go away when we switch this to use compute threading. |
| void ProcessDiscardInvalidateTiles( |
| SWR_CONTEXT *pContext, |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| void *pUserData) |
| { |
| AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId); |
| DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; |
| MacroTileMgr *pTileMgr = pDC->pTileMgr; |
| |
| // compute macro tile bounds for the specified rect |
| uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM; |
| uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1; |
| uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM; |
| uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1; |
| |
| if (pDesc->fullTilesOnly == false) |
| { |
| // include partial tiles |
| macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM; |
| macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM; |
| macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM; |
| macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM; |
| } |
| |
| SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X); |
| SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y); |
| |
| macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X); |
| macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y); |
| |
| // load tiles |
| BE_WORK work; |
| work.type = DISCARDINVALIDATETILES; |
| work.pfnWork = ProcessDiscardInvalidateTilesBE; |
| work.desc.discardInvalidateTiles = *pDesc; |
| |
| for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x) |
| { |
| for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y) |
| { |
| pTileMgr->enqueue(x, y, &work); |
| } |
| } |
| |
| AR_END(FEProcessInvalidateTiles, 0); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Computes the number of primitives given the number of verts. |
| /// @param mode - primitive topology for draw operation. |
| /// @param numPrims - number of vertices or indices for draw. |
| /// @todo Frontend needs to be refactored. This will go in appropriate place then. |
| uint32_t GetNumPrims( |
| PRIMITIVE_TOPOLOGY mode, |
| uint32_t numPrims) |
| { |
| switch (mode) |
| { |
| case TOP_POINT_LIST: return numPrims; |
| case TOP_TRIANGLE_LIST: return numPrims / 3; |
| case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2; |
| case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2; |
| case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1; |
| case TOP_QUAD_LIST: return numPrims / 4; |
| case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2; |
| case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1; |
| case TOP_LINE_LIST: return numPrims / 2; |
| case TOP_LINE_LOOP: return numPrims; |
| case TOP_RECT_LIST: return numPrims / 3; |
| case TOP_LINE_LIST_ADJ: return numPrims / 4; |
| case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3; |
| case TOP_TRI_LIST_ADJ: return numPrims / 6; |
| case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2; |
| |
| case TOP_PATCHLIST_1: |
| case TOP_PATCHLIST_2: |
| case TOP_PATCHLIST_3: |
| case TOP_PATCHLIST_4: |
| case TOP_PATCHLIST_5: |
| case TOP_PATCHLIST_6: |
| case TOP_PATCHLIST_7: |
| case TOP_PATCHLIST_8: |
| case TOP_PATCHLIST_9: |
| case TOP_PATCHLIST_10: |
| case TOP_PATCHLIST_11: |
| case TOP_PATCHLIST_12: |
| case TOP_PATCHLIST_13: |
| case TOP_PATCHLIST_14: |
| case TOP_PATCHLIST_15: |
| case TOP_PATCHLIST_16: |
| case TOP_PATCHLIST_17: |
| case TOP_PATCHLIST_18: |
| case TOP_PATCHLIST_19: |
| case TOP_PATCHLIST_20: |
| case TOP_PATCHLIST_21: |
| case TOP_PATCHLIST_22: |
| case TOP_PATCHLIST_23: |
| case TOP_PATCHLIST_24: |
| case TOP_PATCHLIST_25: |
| case TOP_PATCHLIST_26: |
| case TOP_PATCHLIST_27: |
| case TOP_PATCHLIST_28: |
| case TOP_PATCHLIST_29: |
| case TOP_PATCHLIST_30: |
| case TOP_PATCHLIST_31: |
| case TOP_PATCHLIST_32: |
| return numPrims / (mode - TOP_PATCHLIST_BASE); |
| |
| case TOP_POLYGON: |
| case TOP_POINT_LIST_BF: |
| case TOP_LINE_STRIP_CONT: |
| case TOP_LINE_STRIP_BF: |
| case TOP_LINE_STRIP_CONT_BF: |
| case TOP_TRIANGLE_FAN_NOSTIPPLE: |
| case TOP_TRI_STRIP_REVERSE: |
| case TOP_PATCHLIST_BASE: |
| case TOP_UNKNOWN: |
| SWR_INVALID("Unsupported topology: %d", mode); |
| return 0; |
| } |
| |
| return 0; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Computes the number of verts given the number of primitives. |
| /// @param mode - primitive topology for draw operation. |
| /// @param numPrims - number of primitives for draw. |
| uint32_t GetNumVerts( |
| PRIMITIVE_TOPOLOGY mode, |
| uint32_t numPrims) |
| { |
| switch (mode) |
| { |
| case TOP_POINT_LIST: return numPrims; |
| case TOP_TRIANGLE_LIST: return numPrims * 3; |
| case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0; |
| case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0; |
| case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0; |
| case TOP_QUAD_LIST: return numPrims * 4; |
| case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0; |
| case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0; |
| case TOP_LINE_LIST: return numPrims * 2; |
| case TOP_LINE_LOOP: return numPrims; |
| case TOP_RECT_LIST: return numPrims * 3; |
| case TOP_LINE_LIST_ADJ: return numPrims * 4; |
| case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0; |
| case TOP_TRI_LIST_ADJ: return numPrims * 6; |
| case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0; |
| |
| case TOP_PATCHLIST_1: |
| case TOP_PATCHLIST_2: |
| case TOP_PATCHLIST_3: |
| case TOP_PATCHLIST_4: |
| case TOP_PATCHLIST_5: |
| case TOP_PATCHLIST_6: |
| case TOP_PATCHLIST_7: |
| case TOP_PATCHLIST_8: |
| case TOP_PATCHLIST_9: |
| case TOP_PATCHLIST_10: |
| case TOP_PATCHLIST_11: |
| case TOP_PATCHLIST_12: |
| case TOP_PATCHLIST_13: |
| case TOP_PATCHLIST_14: |
| case TOP_PATCHLIST_15: |
| case TOP_PATCHLIST_16: |
| case TOP_PATCHLIST_17: |
| case TOP_PATCHLIST_18: |
| case TOP_PATCHLIST_19: |
| case TOP_PATCHLIST_20: |
| case TOP_PATCHLIST_21: |
| case TOP_PATCHLIST_22: |
| case TOP_PATCHLIST_23: |
| case TOP_PATCHLIST_24: |
| case TOP_PATCHLIST_25: |
| case TOP_PATCHLIST_26: |
| case TOP_PATCHLIST_27: |
| case TOP_PATCHLIST_28: |
| case TOP_PATCHLIST_29: |
| case TOP_PATCHLIST_30: |
| case TOP_PATCHLIST_31: |
| case TOP_PATCHLIST_32: |
| return numPrims * (mode - TOP_PATCHLIST_BASE); |
| |
| case TOP_POLYGON: |
| case TOP_POINT_LIST_BF: |
| case TOP_LINE_STRIP_CONT: |
| case TOP_LINE_STRIP_BF: |
| case TOP_LINE_STRIP_CONT_BF: |
| case TOP_TRIANGLE_FAN_NOSTIPPLE: |
| case TOP_TRI_STRIP_REVERSE: |
| case TOP_PATCHLIST_BASE: |
| case TOP_UNKNOWN: |
| SWR_INVALID("Unsupported topology: %d", mode); |
| return 0; |
| } |
| |
| return 0; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Return number of verts per primitive. |
| /// @param topology - topology |
| /// @param includeAdjVerts - include adjacent verts in primitive vertices |
| INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) |
| { |
| uint32_t numVerts = 0; |
| switch (topology) |
| { |
| case TOP_POINT_LIST: |
| case TOP_POINT_LIST_BF: |
| numVerts = 1; |
| break; |
| case TOP_LINE_LIST: |
| case TOP_LINE_STRIP: |
| case TOP_LINE_LIST_ADJ: |
| case TOP_LINE_LOOP: |
| case TOP_LINE_STRIP_CONT: |
| case TOP_LINE_STRIP_BF: |
| case TOP_LISTSTRIP_ADJ: |
| numVerts = 2; |
| break; |
| case TOP_TRIANGLE_LIST: |
| case TOP_TRIANGLE_STRIP: |
| case TOP_TRIANGLE_FAN: |
| case TOP_TRI_LIST_ADJ: |
| case TOP_TRI_STRIP_ADJ: |
| case TOP_TRI_STRIP_REVERSE: |
| case TOP_RECT_LIST: |
| numVerts = 3; |
| break; |
| case TOP_QUAD_LIST: |
| case TOP_QUAD_STRIP: |
| numVerts = 4; |
| break; |
| case TOP_PATCHLIST_1: |
| case TOP_PATCHLIST_2: |
| case TOP_PATCHLIST_3: |
| case TOP_PATCHLIST_4: |
| case TOP_PATCHLIST_5: |
| case TOP_PATCHLIST_6: |
| case TOP_PATCHLIST_7: |
| case TOP_PATCHLIST_8: |
| case TOP_PATCHLIST_9: |
| case TOP_PATCHLIST_10: |
| case TOP_PATCHLIST_11: |
| case TOP_PATCHLIST_12: |
| case TOP_PATCHLIST_13: |
| case TOP_PATCHLIST_14: |
| case TOP_PATCHLIST_15: |
| case TOP_PATCHLIST_16: |
| case TOP_PATCHLIST_17: |
| case TOP_PATCHLIST_18: |
| case TOP_PATCHLIST_19: |
| case TOP_PATCHLIST_20: |
| case TOP_PATCHLIST_21: |
| case TOP_PATCHLIST_22: |
| case TOP_PATCHLIST_23: |
| case TOP_PATCHLIST_24: |
| case TOP_PATCHLIST_25: |
| case TOP_PATCHLIST_26: |
| case TOP_PATCHLIST_27: |
| case TOP_PATCHLIST_28: |
| case TOP_PATCHLIST_29: |
| case TOP_PATCHLIST_30: |
| case TOP_PATCHLIST_31: |
| case TOP_PATCHLIST_32: |
| numVerts = topology - TOP_PATCHLIST_BASE; |
| break; |
| default: |
| SWR_INVALID("Unsupported topology: %d", topology); |
| break; |
| } |
| |
| if (includeAdjVerts) |
| { |
| switch (topology) |
| { |
| case TOP_LISTSTRIP_ADJ: |
| case TOP_LINE_LIST_ADJ: numVerts = 4; break; |
| case TOP_TRI_STRIP_ADJ: |
| case TOP_TRI_LIST_ADJ: numVerts = 6; break; |
| default: break; |
| } |
| } |
| |
| return numVerts; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Generate mask from remaining work. |
| /// @param numWorkItems - Number of items being worked on by a SIMD. |
| static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) |
| { |
| uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining; |
| uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0; |
| return _simd_castps_si(_simd_vmask_ps(mask)); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief StreamOut - Streams vertex data out to SO buffers. |
| /// Generally, we are only streaming out a SIMDs worth of triangles. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Even thread has a unique id. |
| /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris) |
| static void StreamOut( |
| DRAW_CONTEXT* pDC, |
| PA_STATE& pa, |
| uint32_t workerId, |
| uint32_t* pPrimData, |
| uint32_t streamIndex) |
| { |
| SWR_CONTEXT *pContext = pDC->pContext; |
| |
| AR_BEGIN(FEStreamout, pDC->drawId); |
| |
| const API_STATE& state = GetApiState(pDC); |
| const SWR_STREAMOUT_STATE &soState = state.soState; |
| |
| uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); |
| |
| // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex. |
| uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t); |
| |
| SWR_STREAMOUT_CONTEXT soContext = { 0 }; |
| |
| // Setup buffer state pointers. |
| for (uint32_t i = 0; i < 4; ++i) |
| { |
| soContext.pBuffer[i] = &state.soBuffer[i]; |
| } |
| |
| uint32_t numPrims = pa.NumPrims(); |
| |
| for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) |
| { |
| DWORD slot = 0; |
| uint32_t soMask = soState.streamMasks[streamIndex]; |
| |
| // Write all entries into primitive data buffer for SOS. |
| while (_BitScanForward(&slot, soMask)) |
| { |
| simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide) |
| uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex]; |
| pa.AssembleSingle(paSlot, primIndex, attrib); |
| |
| // Attribute offset is relative offset from start of vertex. |
| // Note that attributes start at slot 1 in the PA buffer. We need to write this |
| // to prim data starting at slot 0. Which is why we do (slot - 1). |
| // Also note: GL works slightly differently, and needs slot 0 |
| uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t); |
| |
| // Store each vertex's attrib at appropriate locations in pPrimData buffer. |
| for (uint32_t v = 0; v < soVertsPerPrim; ++v) |
| { |
| uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride); |
| |
| _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); |
| } |
| |
| soMask &= ~(1 << slot); |
| } |
| |
| // Update pPrimData pointer |
| soContext.pPrimData = pPrimData; |
| |
| // Call SOS |
| SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function."); |
| state.pfnSoFunc[streamIndex](soContext); |
| } |
| |
| // Update SO write offset. The driver provides memory for the update. |
| for (uint32_t i = 0; i < 4; ++i) |
| { |
| if (state.soBuffer[i].pWriteOffset) |
| { |
| *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); |
| } |
| |
| if (state.soBuffer[i].soWriteEnable) |
| { |
| pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); |
| pDC->dynState.SoWriteOffsetDirty[i] = true; |
| } |
| } |
| |
| UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded); |
| UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten); |
| |
| AR_END(FEStreamout, 1); |
| } |
| |
| #if USE_SIMD16_FRONTEND |
| ////////////////////////////////////////////////////////////////////////// |
| /// Is value an even number (a multiple of two) |
| /// |
| template <typename T> |
| INLINE static bool IsEven(T value) |
| { |
| return (value & 1) == 0; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// Round up value to an even number (a multiple of two) |
| /// |
| template <typename T> |
| INLINE static T RoundUpEven(T value) |
| { |
| return (value + 1) & ~1; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// Round down value to an even number (a multiple of two) |
| /// |
| template <typename T> |
| INLINE static T RoundDownEven(T value) |
| { |
| return value & ~1; |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping |
| /// |
| /// vertexCount is in terms of the source simdvertexes and must be even |
| /// |
| /// attribCount will limit the vector copies to those attribs specified |
| /// |
| /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS |
| /// |
| void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount) |
| { |
| SWR_ASSERT(vertex); |
| SWR_ASSERT(vertex_simd16); |
| SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS); |
| |
| simd16vertex temp; |
| |
| for (uint32_t i = 0; i < vertexCount; i += 2) |
| { |
| for (uint32_t j = 0; j < attribCount; j += 1) |
| { |
| for (uint32_t k = 0; k < 4; k += 1) |
| { |
| temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0); |
| |
| if ((i + 1) < vertexCount) |
| { |
| temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1); |
| } |
| } |
| } |
| |
| for (uint32_t j = 0; j < attribCount; j += 1) |
| { |
| vertex_simd16[i >> 1].attrib[j] = temp.attrib[j]; |
| } |
| } |
| } |
| |
| #endif |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Computes number of invocations. The current index represents |
| /// the start of the SIMD. The max index represents how much work |
| /// items are remaining. If there is less then a SIMD's xmin of work |
| /// then return the remaining amount of work. |
| /// @param curIndex - The start index for the SIMD. |
| /// @param maxIndex - The last index for all work items. |
| static INLINE uint32_t GetNumInvocations( |
| uint32_t curIndex, |
| uint32_t maxIndex) |
| { |
| uint32_t remainder = (maxIndex - curIndex); |
| #if USE_SIMD16_FRONTEND |
| return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder; |
| #else |
| return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder; |
| #endif |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Converts a streamId buffer to a cut buffer for the given stream id. |
| /// The geometry shader will loop over each active streamout buffer, assembling |
| /// primitives for the downstream stages. When multistream output is enabled, |
| /// the generated stream ID buffer from the GS needs to be converted to a cut |
| /// buffer for the primitive assembler. |
| /// @param stream - stream id to generate the cut buffer for |
| /// @param pStreamIdBase - pointer to the stream ID buffer |
| /// @param numEmittedVerts - Number of total verts emitted by the GS |
| /// @param pCutBuffer - output buffer to write cuts to |
| void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer) |
| { |
| SWR_ASSERT(stream < MAX_SO_STREAMS); |
| |
| uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8; |
| uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U); |
| |
| for (uint32_t b = 0; b < numOutputBytes; ++b) |
| { |
| uint8_t curInputByte = pStreamIdBase[2*b]; |
| uint8_t outByte = 0; |
| for (uint32_t i = 0; i < 4; ++i) |
| { |
| if ((curInputByte & 0x3) != stream) |
| { |
| outByte |= (1 << i); |
| } |
| curInputByte >>= 2; |
| } |
| |
| curInputByte = pStreamIdBase[2 * b + 1]; |
| for (uint32_t i = 0; i < 4; ++i) |
| { |
| if ((curInputByte & 0x3) != stream) |
| { |
| outByte |= (1 << (i + 4)); |
| } |
| curInputByte >>= 2; |
| } |
| |
| *pCutBuffer++ = outByte; |
| } |
| } |
| |
| // Buffers that are allocated if GS is enabled |
| struct GsBuffers |
| { |
| uint8_t* pGsIn; |
| uint8_t* pGsOut[KNOB_SIMD_WIDTH]; |
| uint8_t* pGsTransposed; |
| void* pStreamCutBuffer; |
| }; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler |
| /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler |
| /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader |
| /// @param numVerts - Number of vertices outputted by the GS |
| /// @param numAttribs - Number of attributes per vertex |
| template<typename SIMD_T, uint32_t SimdWidth> |
| void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs) |
| { |
| uint32_t srcVertexStride = numAttribs * sizeof(float) * 4; |
| uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4; |
| |
| OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth]; |
| |
| for (uint32_t i = 0; i < SimdWidth; ++i) |
| { |
| gatherOffsets[i] = srcVertexStride * i; |
| } |
| auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]); |
| |
| uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth; |
| uint32_t remainingVerts = numVerts; |
| |
| for (uint32_t s = 0; s < numSimd; ++s) |
| { |
| uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth; |
| uint8_t* pDstBase = pDst + s * dstVertexStride; |
| |
| // Compute mask to prevent src overflow |
| uint32_t mask = std::min(remainingVerts, SimdWidth); |
| mask = GenMask(mask); |
| auto vMask = SIMD_T::vmask_ps(mask); |
| auto viMask = SIMD_T::castps_si(vMask); |
| |
| for (uint32_t a = 0; a < numAttribs; ++a) |
| { |
| auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); |
| auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask); |
| auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask); |
| auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask); |
| |
| SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); |
| SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY); |
| SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ); |
| SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW); |
| |
| pSrcBase += sizeof(float) * 4; |
| pDstBase += sizeof(typename SIMD_T::Float) * 4; |
| } |
| remainingVerts -= SimdWidth; |
| } |
| } |
| |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Implements GS stage. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Every thread has a unique id. |
| /// @param pa - The primitive assembly object. |
| /// @param pGsOut - output stream for GS |
| template < |
| typename HasStreamOutT, |
| typename HasRastT> |
| static void GeometryShaderStage( |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| PA_STATE& pa, |
| GsBuffers* pGsBuffers, |
| uint32_t* pSoPrimData, |
| #if USE_SIMD16_FRONTEND |
| uint32_t numPrims_simd8, |
| #endif |
| simdscalari const &primID) |
| { |
| SWR_CONTEXT *pContext = pDC->pContext; |
| |
| AR_BEGIN(FEGeometryShader, pDC->drawId); |
| |
| const API_STATE& state = GetApiState(pDC); |
| const SWR_GS_STATE* pState = &state.gsState; |
| SWR_GS_CONTEXT gsContext; |
| |
| static uint8_t sNullBuffer[128] = { 0 }; |
| |
| for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) |
| { |
| gsContext.pStreams[i] = pGsBuffers->pGsOut[i]; |
| } |
| gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn; |
| gsContext.PrimitiveID = primID; |
| |
| uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); |
| simdvector attrib[MAX_NUM_VERTS_PER_PRIM]; |
| |
| // assemble all attributes for the input primitive |
| gsContext.inputVertStride = pState->inputVertStride; |
| for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) |
| { |
| uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot; |
| uint32_t attribSlot = pState->vertexAttribOffset + slot; |
| pa.Assemble(srcAttribSlot, attrib); |
| |
| for (uint32_t i = 0; i < numVertsPerPrim; ++i) |
| { |
| gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i]; |
| } |
| } |
| |
| // assemble position |
| pa.Assemble(VERTEX_POSITION_SLOT, attrib); |
| for (uint32_t i = 0; i < numVertsPerPrim; ++i) |
| { |
| gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i]; |
| } |
| |
| // record valid prims from the frontend to avoid over binning the newly generated |
| // prims from the GS |
| #if USE_SIMD16_FRONTEND |
| uint32_t numInputPrims = numPrims_simd8; |
| #else |
| uint32_t numInputPrims = pa.NumPrims(); |
| #endif |
| |
| for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) |
| { |
| gsContext.InstanceID = instance; |
| gsContext.mask = GenerateMask(numInputPrims); |
| |
| // execute the geometry shader |
| state.pfnGsFunc(GetPrivateState(pDC), &gsContext); |
| |
| for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) |
| { |
| gsContext.pStreams[i] += pState->allocationSize; |
| } |
| } |
| |
| // set up new binner and state for the GS output topology |
| #if USE_SIMD16_FRONTEND |
| PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr; |
| if (HasRastT::value) |
| { |
| switch (pState->outputTopology) |
| { |
| case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break; |
| case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break; |
| case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break; |
| default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); |
| } |
| } |
| |
| #else |
| PFN_PROCESS_PRIMS pfnClipFunc = nullptr; |
| if (HasRastT::value) |
| { |
| switch (pState->outputTopology) |
| { |
| case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; |
| case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; |
| case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; |
| default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology); |
| } |
| } |
| |
| #endif |
| // foreach input prim: |
| // - setup a new PA based on the emitted verts for that prim |
| // - loop over the new verts, calling PA to assemble each prim |
| uint32_t* pPrimitiveId = (uint32_t*)&primID; |
| |
| uint32_t totalPrimsGenerated = 0; |
| for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) |
| { |
| uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim]; |
| |
| // Vertex count is either emitted by shader or static |
| uint32_t vertexCount = 0; |
| if (pState->staticVertexCount) |
| { |
| vertexCount = pState->staticVertexCount; |
| } |
| else |
| { |
| // If emitted in shader, it should be the stored in the first dword of the output buffer |
| vertexCount = *(uint32_t*)pInstanceBase; |
| } |
| |
| for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) |
| { |
| uint32_t numEmittedVerts = vertexCount; |
| if (numEmittedVerts == 0) |
| { |
| continue; |
| } |
| |
| uint8_t* pBase = pInstanceBase + instance * pState->allocationSize; |
| uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset; |
| uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset; |
| |
| #if USE_SIMD16_FRONTEND |
| TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize); |
| #else |
| TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize); |
| #endif |
| |
| uint32_t numAttribs = state.feNumAttributes; |
| |
| for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) |
| { |
| bool processCutVerts = false; |
| uint8_t* pCutBuffer = pCutBase; |
| |
| // assign default stream ID, only relevant when GS is outputting a single stream |
| uint32_t streamID = 0; |
| if (pState->isSingleStream) |
| { |
| processCutVerts = true; |
| streamID = pState->singleStreamID; |
| if (streamID != stream) continue; |
| } |
| else |
| { |
| // early exit if this stream is not enabled for streamout |
| if (HasStreamOutT::value && !state.soState.streamEnable[stream]) |
| { |
| continue; |
| } |
| |
| // multi-stream output, need to translate StreamID buffer to a cut buffer |
| ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer); |
| pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer; |
| processCutVerts = false; |
| } |
| |
| #if USE_SIMD16_FRONTEND |
| PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); |
| |
| #else |
| PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); |
| |
| #endif |
| while (gsPa.GetNextStreamOutput()) |
| { |
| do |
| { |
| #if USE_SIMD16_FRONTEND |
| simd16vector attrib_simd16[3]; |
| |
| bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16); |
| |
| #else |
| bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); |
| |
| #endif |
| if (assemble) |
| { |
| totalPrimsGenerated += gsPa.NumPrims(); |
| |
| if (HasStreamOutT::value) |
| { |
| #if ENABLE_AVX512_SIMD16 |
| gsPa.useAlternateOffset = false; |
| #endif |
| StreamOut(pDC, gsPa, workerId, pSoPrimData, stream); |
| } |
| |
| if (HasRastT::value && state.soState.streamToRasterizer == stream) |
| { |
| #if USE_SIMD16_FRONTEND |
| simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]); |
| { |
| gsPa.useAlternateOffset = false; |
| pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId); |
| } |
| #else |
| simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); |
| pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); |
| #endif |
| } |
| } |
| } while (gsPa.NextPrim()); |
| } |
| } |
| } |
| } |
| |
| // update GS pipeline stats |
| UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount); |
| UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated); |
| AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims)); |
| AR_END(FEGeometryShader, 1); |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Allocate GS buffers |
| /// @param pDC - pointer to draw context. |
| /// @param state - API state |
| /// @param ppGsOut - pointer to GS output buffer allocation |
| /// @param ppCutBuffer - pointer to GS output cut buffer allocation |
| template<typename SIMD_T, uint32_t SIMD_WIDTH> |
| static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers) |
| { |
| auto pArena = pDC->pArena; |
| SWR_ASSERT(pArena != nullptr); |
| SWR_ASSERT(state.gsState.gsEnable); |
| |
| const SWR_GS_STATE& gsState = state.gsState; |
| |
| // Allocate storage for vertex inputs |
| uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim; |
| pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32); |
| |
| // Allocate arena space to hold GS output verts |
| const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize; |
| |
| for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) |
| { |
| pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32); |
| } |
| |
| // Allocate storage for transposed GS output |
| uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH; |
| uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4); |
| pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32); |
| |
| // Allocate storage to hold temporary stream->cut buffer, if necessary |
| if (state.gsState.isSingleStream) |
| { |
| pGsBuffers->pStreamCutBuffer = nullptr; |
| } |
| else |
| { |
| pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Contains all data generated by the HS and passed to the |
| /// tessellator and DS. |
/// One instance is allocated on demand by AllocateTessellationData and is
/// reached through gt_pTessellationThreadData.
| struct TessellationThreadLocalData |
| { |
// Hull shader context handed to pfnHsFunc by TessellationStages.
| SWR_HS_CONTEXT hsContext; |
// Control point output storage, one ScalarPatch per SIMD lane; hsContext.pCPout is pointed at it.
| ScalarPatch patchData[KNOB_SIMD_WIDTH]; |
// Buffer and size passed to TSInitCtx for the tessellator context.
| void* pTxCtx; |
| size_t tsCtxSize; |
| |
// Domain shader output buffer and its current allocation size in bytes; grown on demand.
| simdscalar* pDSOutput; |
| size_t dsOutputAllocSize; |
| }; |
| |
// Tessellation data for the current thread; nullptr until AllocateTessellationData has run.
// NOTE(review): THREAD is presumably the project's thread-local storage specifier - confirm.
| THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Allocate tessellation data for this worker thread. |
| INLINE |
| static void AllocateTessellationData(SWR_CONTEXT* pContext) |
| { |
| /// @TODO - Don't use thread local storage. Use Worker local storage instead. |
| if (gt_pTessellationThreadData == nullptr) |
| { |
| gt_pTessellationThreadData = (TessellationThreadLocalData*) |
| AlignedMalloc(sizeof(TessellationThreadLocalData), 64); |
| memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); |
| } |
| } |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief Implements Tessellation Stages. |
/// Runs the hull shader on the assembled input patches, tessellates each
/// patch, runs the domain shader over the generated domain points, and then
/// forwards the tessellated primitives either to GeometryShaderStage or to
/// StreamOut and the clipper.
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. Every thread has a unique id. |
| /// @param pa - The primitive assembly object. |
| /// @param pGsBuffers - GS buffers, forwarded to GeometryShaderStage |
/// @param pSoPrimData - stream-out primitive data storage, forwarded to StreamOut / GeometryShaderStage
/// @param numPrims_simd8 - (USE_SIMD16_FRONTEND only) number of input prims to process in this call
/// @param primID - primitive IDs of the input patches, one per SIMD lane
| template < |
| typename HasGeometryShaderT, |
| typename HasStreamOutT, |
| typename HasRastT> |
| static void TessellationStages( |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| PA_STATE& pa, |
| GsBuffers* pGsBuffers, |
| uint32_t* pSoPrimData, |
| #if USE_SIMD16_FRONTEND |
| uint32_t numPrims_simd8, |
| #endif |
| simdscalari const &primID) |
| { |
| SWR_CONTEXT *pContext = pDC->pContext; |
| const API_STATE& state = GetApiState(pDC); |
| const SWR_TS_STATE& tsState = state.tsState; |
| |
| SWR_ASSERT(gt_pTessellationThreadData); |
| |
// Try to set up the tessellator context in the buffer cached in the thread data.
| HANDLE tsCtx = TSInitCtx( |
| tsState.domain, |
| tsState.partitioning, |
| tsState.tsOutputTopology, |
| gt_pTessellationThreadData->pTxCtx, |
| gt_pTessellationThreadData->tsCtxSize); |
// No context was returned: allocate a tsCtxSize-byte buffer and retry.
// NOTE(review): presumably the failed TSInitCtx call reports the required size through
// tsCtxSize; any buffer previously held in pTxCtx is not freed here - confirm against TSInitCtx.
| if (tsCtx == nullptr) |
| { |
| gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64); |
| tsCtx = TSInitCtx( |
| tsState.domain, |
| tsState.partitioning, |
| tsState.tsOutputTopology, |
| gt_pTessellationThreadData->pTxCtx, |
| gt_pTessellationThreadData->tsCtxSize); |
| } |
| SWR_ASSERT(tsCtx); |
| |
// Pick the clipper entry point that matches the post-DS topology (only when rasterizing).
| #if USE_SIMD16_FRONTEND |
| PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr; |
| if (HasRastT::value) |
| { |
| switch (tsState.postDSTopology) |
| { |
| case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break; |
| case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break; |
| case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break; |
| default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); |
| } |
| } |
| |
| #else |
| PFN_PROCESS_PRIMS pfnClipFunc = nullptr; |
| if (HasRastT::value) |
| { |
| switch (tsState.postDSTopology) |
| { |
| case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; |
| case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; |
| case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; |
| default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology); |
| } |
| } |
| |
| #endif |
// The HS writes its control point output into this thread's patchData array.
| SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; |
| hsContext.pCPout = gt_pTessellationThreadData->patchData; |
| hsContext.PrimitiveID = primID; |
| |
| uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); |
| // Max storage for one attribute for an entire simdprimitive |
| simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; |
| |
| // assemble all attributes for the input primitives |
| for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) |
| { |
| uint32_t attribSlot = tsState.vertexAttribOffset + slot; |
| pa.Assemble(attribSlot, simdattrib); |
| |
// Copy the assembled attribute into each vertex of the HS input.
| for (uint32_t i = 0; i < numVertsPerPrim; ++i) |
| { |
| hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i]; |
| } |
| } |
| |
// Debug builds: fill the HS output storage with a 0x90 byte pattern before the shader runs.
| #if defined(_DEBUG) |
| memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); |
| #endif |
| |
// Number of input patches handled by this call.
| #if USE_SIMD16_FRONTEND |
| uint32_t numPrims = numPrims_simd8; |
| #else |
| uint32_t numPrims = pa.NumPrims(); |
| #endif |
| hsContext.mask = GenerateMask(numPrims); |
| |
| // Run the HS |
| AR_BEGIN(FEHullShader, pDC->drawId); |
| state.pfnHsFunc(GetPrivateState(pDC), &hsContext); |
| AR_END(FEHullShader, 0); |
| |
| UPDATE_STAT_FE(HsInvocations, numPrims); |
| |
// View the SIMD primID value as an array of per-lane scalar IDs.
| const uint32_t* pPrimId = (const uint32_t*)&primID; |
| |
// Tessellate and shade each input patch individually.
| for (uint32_t p = 0; p < numPrims; ++p) |
| { |
| // Run Tessellator |
| SWR_TS_TESSELLATED_DATA tsData = { 0 }; |
| AR_BEGIN(FETessellation, pDC->drawId); |
| TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); |
| AR_EVENT(TessPrimCount(1)); |
| AR_END(FETessellation, 0); |
| |
// The tessellator produced no primitives for this patch; move on to the next one.
| if (tsData.NumPrimitives == 0) |
| { |
| continue; |
| } |
| SWR_ASSERT(tsData.NumDomainPoints); |
| |
| // Allocate DS Output memory |
// One DS invocation is needed per SIMD-width batch of domain points.
| uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; |
| #if USE_SIMD16_FRONTEND |
| size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize; // simd8 -> simd16, padding |
| #else |
| size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize; |
| size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; |
| #endif |
// Replace the cached DS output buffer when this patch needs more space than is currently allocated.
| if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize) |
| { |
| AlignedFree(gt_pTessellationThreadData->pDSOutput); |
| gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64); |
| gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize; |
| } |
| SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); |
| SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize); |
| |
| #if defined(_DEBUG) |
| memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize); |
| #endif |
| |
| // Run Domain Shader |
| SWR_DS_CONTEXT dsContext; |
| dsContext.PrimitiveID = pPrimId[p]; |
| dsContext.pCpIn = &hsContext.pCPout[p]; |
| dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; |
| dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; |
| dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; |
| dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset; |
| #if USE_SIMD16_FRONTEND |
| dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16 |
| #else |
| dsContext.vectorStride = requiredDSVectorInvocations; |
| #endif |
| |
| uint32_t dsInvocations = 0; |
| |
// Invoke the DS once per SIMD batch; the mask is generated from the number of
// domain points that have not been processed yet.
| for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) |
| { |
| dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); |
| |
| AR_BEGIN(FEDomainShader, pDC->drawId); |
| state.pfnDsFunc(GetPrivateState(pDC), &dsContext); |
| AR_END(FEDomainShader, 0); |
| |
| dsInvocations += KNOB_SIMD_WIDTH; |
| } |
| UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints); |
| |
| #if USE_SIMD16_FRONTEND |
| SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16 |
| |
| #endif |
// Primitive assembler over the DS output, driven by the index lists produced by the tessellator.
| PA_TESS tessPa( |
| pDC, |
| #if USE_SIMD16_FRONTEND |
| reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16 |
| dsContext.vectorStride / 2, // simd8 -> simd16 |
| #else |
| dsContext.pOutputData, |
| dsContext.vectorStride, |
| #endif |
| SWR_VTX_NUM_SLOTS, |
| tsState.numDsOutputAttribs, |
| tsData.ppIndices, |
| tsData.NumPrimitives, |
| tsState.postDSTopology, |
| numVertsPerPrim); |
| |
// Drain the tessellated primitives from the assembler.
| while (tessPa.HasWork()) |
| { |
| #if USE_SIMD16_FRONTEND |
// Split the simd16 batch into lo/hi simd8 halves.
// Note: these locals shadow the outer numPrims and the primID parameter.
| const uint32_t numPrims = tessPa.NumPrims(); |
| const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); |
| const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; |
| |
| const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID); |
| const simdscalari primID_lo = _simd16_extract_si(primID, 0); |
| const simdscalari primID_hi = _simd16_extract_si(primID, 1); |
| |
| #endif |
| if (HasGeometryShaderT::value) |
| { |
| #if USE_SIMD16_FRONTEND |
// Run the GS stage on the lo half, then on the hi half when it is non-empty.
| tessPa.useAlternateOffset = false; |
| GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo); |
| |
| if (numPrims_hi) |
| { |
| tessPa.useAlternateOffset = true; |
| GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi); |
| } |
| #else |
| GeometryShaderStage<HasStreamOutT, HasRastT>( |
| pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID)); |
| #endif |
| } |
| else |
| { |
// No GS: stream out (stream 0) and/or clip the tessellated primitives directly.
| if (HasStreamOutT::value) |
| { |
| #if ENABLE_AVX512_SIMD16 |
| tessPa.useAlternateOffset = false; |
| #endif |
| StreamOut(pDC, tessPa, workerId, pSoPrimData, 0); |
| } |
| |
| if (HasRastT::value) |
| { |
| #if USE_SIMD16_FRONTEND |
| simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points |
| #else |
| simdvector prim[3]; // Only deal with triangles, lines, or points |
| #endif |
| AR_BEGIN(FEPAAssemble, pDC->drawId); |
| bool assemble = |
| #if USE_SIMD16_FRONTEND |
| tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); |
| #else |
| tessPa.Assemble(VERTEX_POSITION_SLOT, prim); |
| #endif |
| AR_END(FEPAAssemble, 1); |
| SWR_ASSERT(assemble); |
| |
| SWR_ASSERT(pfnClipFunc); |
| #if USE_SIMD16_FRONTEND |
| |
| { |
| tessPa.useAlternateOffset = false; |
| pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID); |
| } |
| #else |
| pfnClipFunc(pDC, tessPa, workerId, prim, |
| GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID)); |
| #endif |
| } |
| } |
| |
| tessPa.NextPrim(); |
| |
| } // while (tessPa.HasWork()) |
| } // for (uint32_t p = 0; p < numPrims; ++p) |
| |
// SIMD16 frontend: release the DS output buffer at the end of every call instead of keeping it cached.
| #if USE_SIMD16_FRONTEND |
| if (gt_pTessellationThreadData->pDSOutput != nullptr) |
| { |
| AlignedFree(gt_pTessellationThreadData->pDSOutput); |
| gt_pTessellationThreadData->pDSOutput = nullptr; |
| } |
| gt_pTessellationThreadData->dsOutputAllocSize = 0; |
| |
| #endif |
| TSDestroyCtx(tsCtx); |
| } |
| |
// Vertex store handed to the primitive assembler by ProcessDraw, and its current size in bytes.
// ProcessDraw frees and reallocates the store whenever a draw needs more than gVertexStoreSize.
// NOTE(review): THREAD presumably makes both variables per-thread - confirm.
| THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr; |
| THREAD uint32_t gVertexStoreSize = 0; |
| |
| ////////////////////////////////////////////////////////////////////////// |
| /// @brief FE handler for SwrDraw. |
/// For every instance, fetches and vertex-shades the draw's vertices one SIMD
/// batch at a time, assembles primitives from the shaded vertices and hands
/// them to tessellation, the geometry shader, stream-out and/or the
/// primitive processing function selected in the draw state.
| /// @tparam IsIndexedT - Is indexed drawing enabled |
/// @tparam IsCutIndexEnabledT - forwarded with IsIndexedT to PA_FACTORY when choosing the primitive assembler
| /// @tparam HasTessellationT - Is tessellation enabled |
| /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled |
| /// @tparam HasStreamOutT - Is stream-out enabled |
| /// @tparam HasRastT - Is rasterization enabled |
| /// @param pContext - pointer to SWR context. |
| /// @param pDC - pointer to draw context. |
| /// @param workerId - thread's worker id. |
| /// @param pUserData - Pointer to DRAW_WORK |
| template < |
| typename IsIndexedT, |
| typename IsCutIndexEnabledT, |
| typename HasTessellationT, |
| typename HasGeometryShaderT, |
| typename HasStreamOutT, |
| typename HasRastT> |
| void ProcessDraw( |
| SWR_CONTEXT *pContext, |
| DRAW_CONTEXT *pDC, |
| uint32_t workerId, |
| void *pUserData) |
| { |
| |
// Toss point: skip all front-end work when KNOB_TOSS_QUEUE_FE is set.
| #if KNOB_ENABLE_TOSS_POINTS |
| if (KNOB_TOSS_QUEUE_FE) |
| { |
| return; |
| } |
| #endif |
| |
| AR_BEGIN(FEProcessDraw, pDC->drawId); |
| |
| DRAW_WORK& work = *(DRAW_WORK*)pUserData; |
| const API_STATE& state = GetApiState(pDC); |
| |
| uint32_t indexSize = 0; |
| uint32_t endVertex = work.numVerts; |
| |
// For indexed draws, record the index size and the address one past the
// last index requested by this draw (index number work.numVerts).
| const int32_t* pLastRequestedIndex = nullptr; |
| if (IsIndexedT::value) |
| { |
| switch (work.type) |
| { |
| case R32_UINT: |
| indexSize = sizeof(uint32_t); |
| pLastRequestedIndex = &(work.pIB[endVertex]); |
| break; |
| case R16_UINT: |
| indexSize = sizeof(uint16_t); |
| // nasty address offset to last index |
| pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); |
| break; |
| case R8_UINT: |
| indexSize = sizeof(uint8_t); |
| // nasty address offset to last index |
| pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); |
| break; |
| default: |
| SWR_INVALID("Invalid work.type: %d", work.type); |
| } |
| } |
| else |
| { |
| // No cuts, prune partial primitives. |
| endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts)); |
| } |
| |
// numPrims is only declared for profiling builds; it is consumed by the AR_END at the bottom.
// NOTE(review): presumably AR_END discards its arguments when neither knob is defined - confirm.
| #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR) |
| uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); |
| #endif |
| |
// GS working buffers are only allocated when the geometry shader stage is enabled.
| GsBuffers gsBuffers; |
| if (HasGeometryShaderT::value) |
| { |
| #if USE_SIMD16_FRONTEND |
| AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); |
| #else |
| AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers); |
| #endif |
| } |
| |
// Check that the tessellation state agrees with the template selection.
| if (HasTessellationT::value) |
| { |
| SWR_ASSERT(state.tsState.tsEnable == true); |
| SWR_ASSERT(state.pfnHsFunc != nullptr); |
| SWR_ASSERT(state.pfnDsFunc != nullptr); |
| |
| AllocateTessellationData(pContext); |
| } |
| else |
| { |
| SWR_ASSERT(state.tsState.tsEnable == false); |
| SWR_ASSERT(state.pfnHsFunc == nullptr); |
| SWR_ASSERT(state.pfnDsFunc == nullptr); |
| } |
| |
| // allocate space for streamout input prim data |
| uint32_t* pSoPrimData = nullptr; |
| if (HasStreamOutT::value) |
| { |
| pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); |
| } |
| |
| const uint32_t vertexCount = NumVertsPerPrim(state.topology, true); |
| #if USE_SIMD16_FRONTEND |
| uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector); |
| #else |
| uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector); |
| #endif |
| |
| SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM); |
| |
| // Compute storage requirements for vertex store |
| // TODO: allocation needs to be rethought for better cut support |
| uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine |
| uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes; |
| |
| // grow the vertex store for the PA as necessary |
| if (gVertexStoreSize < vertexStoreSize) |
| { |
| if (gpVertexStore != nullptr) |
| { |
| AlignedFree(gpVertexStore); |
| gpVertexStore = nullptr; |
| } |
| |
| SWR_ASSERT(gpVertexStore == nullptr); |
| |
| gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64)); |
| gVertexStoreSize = vertexStoreSize; |
| |
| SWR_ASSERT(gpVertexStore != nullptr); |
| } |
| |
| // choose primitive assembler |
| |
| PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1)); |
| PA_STATE& pa = paFactory.GetPA(); |
| |
// SIMD16 frontend: every loop iteration covers KNOB_SIMD16_WIDTH vertices; the vertex
// shader is invoked on the lo and hi simd8 halves through separate contexts.
| #if USE_SIMD16_FRONTEND |
| #if USE_SIMD16_SHADERS |
| simd16vertex vin; |
| #else |
| simdvertex vin_lo; |
| simdvertex vin_hi; |
| #endif |
| SWR_VS_CONTEXT vsContext_lo; |
| SWR_VS_CONTEXT vsContext_hi; |
| |
| #if USE_SIMD16_SHADERS |
| vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin); |
| vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin); |
| #else |
| vsContext_lo.pVin = &vin_lo; |
| vsContext_hi.pVin = &vin_hi; |
| #endif |
| vsContext_lo.AlternateOffset = 0; |
| vsContext_hi.AlternateOffset = 1; |
| |
| SWR_FETCH_CONTEXT fetchInfo_lo = { 0 }; |
| |
| fetchInfo_lo.pStreams = &state.vertexBuffers[0]; |
| fetchInfo_lo.StartInstance = work.startInstance; |
| fetchInfo_lo.StartVertex = 0; |
| |
| if (IsIndexedT::value) |
| { |
| fetchInfo_lo.BaseVertex = work.baseVertex; |
| |
| // if the entire index buffer isn't being consumed, set the last index |
| // so that fetches < a SIMD wide will be masked off |
| fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); |
| if (pLastRequestedIndex < fetchInfo_lo.pLastIndex) |
| { |
| fetchInfo_lo.pLastIndex = pLastRequestedIndex; |
| } |
| } |
| else |
| { |
| fetchInfo_lo.StartVertex = work.startVertex; |
| } |
| |
// The hi fetch context starts as a copy of the lo one; only pIndices differs between them below.
| SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo; |
| |
// Per-lane offsets 0..15 added to the start vertex ID for non-indexed draws.
| const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); |
| |
| for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) |
| { |
| uint32_t i = 0; |
| |
| simd16scalari vIndex; |
| |
| if (IsIndexedT::value) |
| { |
// The hi half reads its indices KNOB_SIMD_WIDTH entries after the lo half.
| fetchInfo_lo.pIndices = work.pIB; |
| fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH |
| } |
| else |
| { |
// Non-indexed: the "indices" are generated vertex IDs held in vIndex.
| vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale); |
| |
| fetchInfo_lo.pIndices = (const int32_t *)&vIndex; |
| fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH |
| } |
| |
| fetchInfo_lo.CurInstance = instanceNum; |
| fetchInfo_hi.CurInstance = instanceNum; |
| |
| vsContext_lo.InstanceID = instanceNum; |
| vsContext_hi.InstanceID = instanceNum; |
| |
| while (pa.HasWork()) |
| { |
| // GetNextVsOutput currently has the side effect of updating some PA state machine state. |
| // So we need to keep this outside of (i < endVertex) check. |
| |
| simdmask *pvCutIndices_lo = nullptr; |
| simdmask *pvCutIndices_hi = nullptr; |
| |
| if (IsIndexedT::value) |
| { |
| // simd16mask <=> simdmask[2] |
| |
| pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0]; |
| pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1]; |
| } |
| |
| simd16vertex &vout = pa.GetNextVsOutput(); |
| |
| vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout); |
| vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout); |
| |
| if (i < endVertex) |
| { |
| // 1. Execute FS/VS for a single SIMD. |
| AR_BEGIN(FEFetchShader, pDC->drawId); |
| #if USE_SIMD16_SHADERS |
| state.pfnFetchFunc(fetchInfo_lo, vin); |
| #else |
| state.pfnFetchFunc(fetchInfo_lo, vin_lo); |
| |
// Only fetch the hi half when vertices remain beyond the lo half.
| if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH |
| { |
| state.pfnFetchFunc(fetchInfo_hi, vin_hi); |
| } |
| #endif |
| AR_END(FEFetchShader, 0); |
| |
| // forward fetch generated vertex IDs to the vertex shader |
| #if USE_SIMD16_SHADERS |
| #if 0 |
| vsContext_lo.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 0); |
| vsContext_hi.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 1); |
| #else |
| vsContext_lo.VertexID = fetchInfo_lo.VertexID; |
| vsContext_hi.VertexID = fetchInfo_lo.VertexID2; |
| #endif |
| #else |
| vsContext_lo.VertexID = fetchInfo_lo.VertexID; |
| vsContext_hi.VertexID = fetchInfo_hi.VertexID; |
| #endif |
| |
| // Setup active mask for vertex shader. |
| vsContext_lo.mask = GenerateMask(endVertex - i); |
| vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); |
| |
| // forward cut mask to the PA |
| if (IsIndexedT::value) |
| { |
| #if USE_SIMD16_SHADERS |
| #if 0 |
| *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 0))); |
| *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 1))); |
| #else |
| *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); |
| *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2)); |
| #endif |
| #else |
| *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); |
| *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask)); |
| #endif |
| } |
| |
| UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); |
| |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (!KNOB_TOSS_FETCH) |
| #endif |
| { |
| AR_BEGIN(FEVertexShader, pDC->drawId); |
| state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); |
| |
// The hi half is only shaded when vertices remain beyond the lo half.
| if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH |
| { |
| state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi); |
| } |
| AR_END(FEVertexShader, 0); |
| |
| UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); |
| } |
| } |
| |
| // 2. Assemble primitives given the last two SIMD. |
| do |
| { |
| simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM]; |
| |
| RDTSC_START(FEPAAssemble); |
| bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16); |
| RDTSC_STOP(FEPAAssemble, 1, 0); |
| |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (!KNOB_TOSS_FETCH) |
| #endif |
| { |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (!KNOB_TOSS_VS) |
| #endif |
| { |
| if (assemble) |
| { |
| UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); |
| |
// Split the assembled simd16 batch into lo/hi simd8 halves for the stages that consume simd8 batches.
| const uint32_t numPrims = pa.NumPrims(); |
| const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); |
| const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; |
| |
| const simd16scalari primID = pa.GetPrimID(work.startPrimID); |
| const simdscalari primID_lo = _simd16_extract_si(primID, 0); |
| const simdscalari primID_hi = _simd16_extract_si(primID, 1); |
| |
| if (HasTessellationT::value) |
| { |
| pa.useAlternateOffset = false; |
| TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); |
| |
| if (numPrims_hi) |
| { |
| pa.useAlternateOffset = true; |
| TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); |
| } |
| } |
| else if (HasGeometryShaderT::value) |
| { |
| pa.useAlternateOffset = false; |
| GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo); |
| |
| if (numPrims_hi) |
| { |
| pa.useAlternateOffset = true; |
| GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi); |
| } |
| } |
| else |
| { |
| // If streamout is enabled then stream vertices out to memory. |
| if (HasStreamOutT::value) |
| { |
| pa.useAlternateOffset = false; |
| StreamOut(pDC, pa, workerId, pSoPrimData, 0); |
| } |
| |
| if (HasRastT::value) |
| { |
| SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16); |
| { |
| pa.useAlternateOffset = false; |
| pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID); |
| } |
| } |
| } |
| } |
| } |
| } |
| } while (pa.NextPrim()); |
| |
// Advance both index pointers (or the generated vertex IDs) by one full simd16 batch.
| if (IsIndexedT::value) |
| { |
| fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize); |
| fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize); |
| } |
| else |
| { |
| vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH)); |
| } |
| |
| i += KNOB_SIMD16_WIDTH; |
| } |
| |
| pa.Reset(); |
| } |
| |
// simd8 frontend: one fetch/VS invocation per KNOB_SIMD_WIDTH vertices.
| #else |
| SWR_VS_CONTEXT vsContext; |
| SWR_FETCH_CONTEXT fetchInfo = { 0 }; |
| |
| fetchInfo.pStreams = &state.vertexBuffers[0]; |
| fetchInfo.StartInstance = work.startInstance; |
| fetchInfo.StartVertex = 0; |
| |
| if (IsIndexedT::value) |
| { |
| fetchInfo.BaseVertex = work.baseVertex; |
| |
| // if the entire index buffer isn't being consumed, set the last index |
| // so that fetches < a SIMD wide will be masked off |
| fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); |
| if (pLastRequestedIndex < fetchInfo.pLastIndex) |
| { |
| fetchInfo.pLastIndex = pLastRequestedIndex; |
| } |
| } |
| else |
| { |
| fetchInfo.StartVertex = work.startVertex; |
| } |
| |
// Per-lane offsets 0..7 added to the start vertex ID for non-indexed draws.
| const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); |
| |
| /// @todo: temporarily move instance loop in the FE to ensure SO ordering |
| for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) |
| { |
| simdscalari vIndex; |
| uint32_t i = 0; |
| |
| if (IsIndexedT::value) |
| { |
| fetchInfo.pIndices = work.pIB; |
| } |
| else |
| { |
// Non-indexed: the "indices" are generated vertex IDs held in vIndex.
| vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); |
| fetchInfo.pIndices = (const int32_t*)&vIndex; |
| } |
| |
| fetchInfo.CurInstance = instanceNum; |
| vsContext.InstanceID = instanceNum; |
| |
| while (pa.HasWork()) |
| { |
| // GetNextVsOutput currently has the side effect of updating some PA state machine state. |
| // So we need to keep this outside of (i < endVertex) check. |
| simdmask* pvCutIndices = nullptr; |
| if (IsIndexedT::value) |
| { |
| pvCutIndices = &pa.GetNextVsIndices(); |
| } |
| |
// The fetch shader writes into vout and the VS then runs in place on it (pVin == pVout).
| simdvertex& vout = pa.GetNextVsOutput(); |
| vsContext.pVin = &vout; |
| vsContext.pVout = &vout; |
| |
| if (i < endVertex) |
| { |
| |
| // 1. Execute FS/VS for a single SIMD. |
| AR_BEGIN(FEFetchShader, pDC->drawId); |
| state.pfnFetchFunc(fetchInfo, vout); |
| AR_END(FEFetchShader, 0); |
| |
| // forward fetch generated vertex IDs to the vertex shader |
| vsContext.VertexID = fetchInfo.VertexID; |
| |
| // Setup active mask for vertex shader. |
| vsContext.mask = GenerateMask(endVertex - i); |
| |
| // forward cut mask to the PA |
| if (IsIndexedT::value) |
| { |
| *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); |
| } |
| |
| UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); |
| |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (!KNOB_TOSS_FETCH) |
| #endif |
| { |
| AR_BEGIN(FEVertexShader, pDC->drawId); |
| state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); |
| AR_END(FEVertexShader, 0); |
| |
| UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); |
| } |
| } |
| |
| // 2. Assemble primitives given the last two SIMD. |
| do |
| { |
| simdvector prim[MAX_NUM_VERTS_PER_PRIM]; |
| // PaAssemble returns false if there is not enough verts to assemble. |
| AR_BEGIN(FEPAAssemble, pDC->drawId); |
| bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); |
| AR_END(FEPAAssemble, 1); |
| |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (!KNOB_TOSS_FETCH) |
| #endif |
| { |
| #if KNOB_ENABLE_TOSS_POINTS |
| if (!KNOB_TOSS_VS) |
| #endif |
| { |
| if (assemble) |
| { |
| UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); |
| |
// Dispatch the assembled primitives: tessellation first, else GS, else stream-out and/or primitive processing.
| if (HasTessellationT::value) |
| { |
| TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( |
| pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID)); |
| } |
| else if (HasGeometryShaderT::value) |
| { |
| GeometryShaderStage<HasStreamOutT, HasRastT>( |
| pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID)); |
| } |
| else |
| { |
| // If streamout is enabled then stream vertices out to memory. |
| if (HasStreamOutT::value) |
| { |
| StreamOut(pDC, pa, workerId, pSoPrimData, 0); |
| } |
| |
| if (HasRastT::value) |
| { |
| SWR_ASSERT(pDC->pState->pfnProcessPrims); |
| |
| pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, |
| GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); |
| } |
| } |
| } |
| } |
| } |
| } while (pa.NextPrim()); |
| |
// Advance the index pointer (or the generated vertex IDs) by one SIMD batch.
| if (IsIndexedT::value) |
| { |
| fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); |
| } |
| else |
| { |
| vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); |
| } |
| |
| i += KNOB_SIMD_WIDTH; |
| } |
| pa.Reset(); |
| } |
| |
| #endif |
| |
| AR_END(FEProcessDraw, numPrims * work.numInstances); |
| } |
| |
| struct FEDrawChooser |
| { |
| typedef PFN_FE_WORK_FUNC FuncType; |
| |
| template <typename... ArgsB> |
| static FuncType GetFunc() |
| { |
| return ProcessDraw<ArgsB...>; |
| } |
| }; |
| |
| |
| // Selector for correct templated Draw front-end function |
| PFN_FE_WORK_FUNC GetProcessDrawFunc( |
| bool IsIndexed, |
| bool IsCutIndexEnabled, |
| bool HasTessellation, |
| bool HasGeometryShader, |
| bool HasStreamOut, |
| bool HasRasterization) |
| { |
| return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization); |
| } |