src/gallium/drivers/swr/rasterizer/core/frontend.cpp - platform/external/mesa3d - Gitiles

 /****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file frontend.cpp
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/

 #include "api.h"
 #include "frontend.h"
 #include "backend.h"
 #include "context.h"
 #include "rdtsc_core.h"
 #include "utils.h"
 #include "threads.h"
 #include "pa.h"
 #include "clip.h"
 #include "tilemgr.h"
 #include "tessellator.h"
 #include <limits>
 #include <iostream>

 //////////////////////////////////////////////////////////////////////////
 /// @brief Helper function to generate a bitmask with the low bits set.
 /// @param numBits - number of low-order bits to set; must be <= 32.
 /// @return Mask with bits [0, numBits) set. numBits == 0 yields 0 and
 ///         numBits == 32 yields a mask with all 32 bits set.
 static INLINE uint32_t GenMask(uint32_t numBits)
 {
     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);

     // Shifting a 32-bit value by its full width (or more) is undefined
     // behavior, so the full-width mask - which the assert above permits -
     // is produced explicitly instead of through the shift.
     if (numBits >= (sizeof(uint32_t) * 8))
     {
         return std::numeric_limits<uint32_t>::max();
     }

     return ((1U << numBits) - 1);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Frontend handler for SwrSync: queues a single SYNC work item
 ///        for the backend.
 /// @param pContext - pointer to SWR context (not referenced here).
 /// @param pDC - pointer to draw context; its tile manager receives the item.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to sync callback
 ///                    (not referenced here).
 /// @todo This should go away when we switch this to use compute threading.
 void ProcessSync(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     // Describe the backend job: a SYNC item serviced by ProcessSyncBE.
     BE_WORK syncWork;
     syncWork.pfnWork = ProcessSyncBE;
     syncWork.type = SYNC;

     // The item is queued exactly once, with tile coordinates (0, 0).
     pDC->pTileMgr->enqueue(0, 0, &syncWork);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Frontend handler for SwrDestroyContext: queues SHUTDOWN work
 ///        items for the backend.
 /// @param pContext - pointer to SWR context; supplies the thread pool's
 ///                   thread count and NUMA mask.
 /// @param pDC - pointer to draw context; its tile manager receives the items.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to sync callback
 ///                    (not referenced here).
 void ProcessShutdown(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     // Backend job shared by every enqueue below.
     BE_WORK shutdownWork;
     shutdownWork.pfnWork = ProcessShutdownBE;
     shutdownWork.type = SHUTDOWN;

     MacroTileMgr *pMgr = pDC->pTileMgr;

     // The node count is derived from the NUMA mask (mask value + 1).
     const uint32_t nodeCount = pContext->threadPool.numaMask + 1;

     // Enqueue at least one work item per worker thread, repeated for every
     // NUMA node.
     for (uint32_t thread = 0; thread < pContext->threadPool.numThreads; ++thread)
     {
         uint32_t node = 0;
         while (node < nodeCount)
         {
             pMgr->enqueue(thread, node, &shutdownWork);
             ++node;
         }
     }
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Frontend handler for SwrClearRenderTarget: queues a CLEAR work
 ///        item to every macrotile covered by the clear rect.
 /// @param pContext - pointer to SWR context (not referenced here).
 /// @param pDC - pointer to draw context; its tile manager receives the items.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pUserData - Pointer to the CLEAR_DESC describing the clear.
 /// @todo This should go away when we switch this to use compute threading.
 void ProcessClear(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     MacroTileMgr *pMgr = pDC->pTileMgr;
     CLEAR_DESC *pClear = static_cast<CLEAR_DESC*>(pUserData);

     // Backend job; the descriptor is copied into it so each queued item
     // carries its own copy of the clear parameters.
     BE_WORK clearWork;
     clearWork.type = CLEAR;
     clearWork.pfnWork = ProcessClearBE;
     clearWork.desc.clear = *pClear;

     // Inclusive macrotile range covered by the rect. The max edge is
     // backed off by one before dividing, i.e. it is treated as exclusive.
     const uint32_t firstTileX = pClear->rect.xmin / KNOB_MACROTILE_X_DIM;
     const uint32_t lastTileX = (pClear->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
     const uint32_t firstTileY = pClear->rect.ymin / KNOB_MACROTILE_Y_DIM;
     const uint32_t lastTileY = (pClear->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

     // Row-major walk: all tiles of a row are queued before the next row.
     for (uint32_t tileY = firstTileY; tileY <= lastTileY; ++tileY)
     {
         for (uint32_t tileX = firstTileX; tileX <= lastTileX; ++tileX)
         {
             pMgr->enqueue(tileX, tileY, &clearWork);
         }
     }
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Frontend handler for SwrStoreTiles: queues a STORETILES work
 ///        item to every macrotile covered by the store rect.
 /// @param pContext - pointer to SWR context (not referenced directly here).
 /// @param pDC - pointer to draw context; its tile manager receives the items.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pUserData - Pointer to the STORE_TILES_DESC describing the store.
 /// @todo This should go away when we switch this to use compute threading.
 void ProcessStoreTiles(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     AR_BEGIN(FEProcessStoreTiles, pDC->drawId);

     STORE_TILES_DESC* pStore = static_cast<STORE_TILES_DESC*>(pUserData);
     MacroTileMgr *pMgr = pDC->pTileMgr;

     // Backend job; the descriptor is copied into it so each queued item
     // carries its own copy of the store parameters.
     BE_WORK storeWork;
     storeWork.type = STORETILES;
     storeWork.pfnWork = ProcessStoreTilesBE;
     storeWork.desc.storeTiles = *pStore;

     // Inclusive macrotile range covered by the rect. The max edge is
     // backed off by one before dividing, i.e. it is treated as exclusive.
     const uint32_t firstTileX = pStore->rect.xmin / KNOB_MACROTILE_X_DIM;
     const uint32_t lastTileX = (pStore->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
     const uint32_t firstTileY = pStore->rect.ymin / KNOB_MACROTILE_Y_DIM;
     const uint32_t lastTileY = (pStore->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

     // Row-major walk: all tiles of a row are queued before the next row.
     for (uint32_t tileY = firstTileY; tileY <= lastTileY; ++tileY)
     {
         for (uint32_t tileX = firstTileX; tileX <= lastTileX; ++tileX)
         {
             pMgr->enqueue(tileX, tileY, &storeWork);
         }
     }

     AR_END(FEProcessStoreTiles, 0);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief FE handler for SwrInvalidateTiles: queues a
 ///        DISCARDINVALIDATETILES work item to each selected macrotile.
 ///        With fullTilesOnly set, only macrotiles lying entirely inside the
 ///        rect are selected; otherwise every macrotile the rect touches is.
 /// @param pContext - pointer to SWR context.
 /// @param pDC - pointer to draw context.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pUserData - Pointer to the DISCARD_INVALIDATE_TILES_DESC for this call.
 /// @todo This should go away when we switch this to use compute threading.
 void ProcessDiscardInvalidateTiles(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
     DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
     MacroTileMgr *pTileMgr = pDC->pTileMgr;

     // Inclusive macrotile bounds for the specified rect.
     uint32_t macroTileXMin = 0;
     uint32_t macroTileXMax = 0;
     uint32_t macroTileYMin = 0;
     uint32_t macroTileYMax = 0;
     bool hasTiles = false;

     if (pDesc->fullTilesOnly == false)
     {
         // include partial tiles
         macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
         macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
         macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
         macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
         hasTiles = true;
     }
     else
     {
         // Index one past the last macrotile that ends at or before the
         // rect's max edge.
         const auto fullTileEndX = pDesc->rect.xmax / KNOB_MACROTILE_X_DIM;
         const auto fullTileEndY = pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM;

         // A rect that ends before the first macrotile boundary contains no
         // full tile. Computing "end - 1" for it would wrap the unsigned
         // bound, and the "<=" loops below would then never terminate, so
         // that case is skipped entirely.
         if (fullTileEndX >= 1 && fullTileEndY >= 1)
         {
             // Round the min edge up and the max edge down to whole tiles.
             macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
             macroTileXMax = fullTileEndX - 1;
             macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
             macroTileYMax = fullTileEndY - 1;
             hasTiles = true;
         }
     }

     if (hasTiles)
     {
         SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
         SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

         // Clamp so release builds (asserts compiled out) stay within the
         // hot tile limits.
         macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
         macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

         // discard/invalidate tiles
         BE_WORK work;
         work.type = DISCARDINVALIDATETILES;
         work.pfnWork = ProcessDiscardInvalidateTilesBE;
         work.desc.discardInvalidateTiles = *pDesc;

         // An empty range (min > max) simply queues nothing.
         for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
         {
             for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
             {
                 pTileMgr->enqueue(x, y, &work);
             }
         }
     }

     AR_END(FEProcessInvalidateTiles, 0);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes the number of primitives given the number of verts.
 /// @param mode - primitive topology for draw operation.
 /// @param numPrims - number of vertices or indices for draw (despite the
 ///                   name, this is the input vertex/index count).
 /// @return Number of complete primitives; 0 for unsupported topologies.
 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
 uint32_t GetNumPrims(
     PRIMITIVE_TOPOLOGY mode,
     uint32_t numPrims)
 {
     uint32_t primCount = 0;

     switch (mode)
     {
     // One primitive per vertex.
     case TOP_POINT_LIST:
     case TOP_LINE_LOOP:
         primCount = numPrims;
         break;

     // Lists: a fixed number of verts per primitive, remainder dropped.
     case TOP_LINE_LIST:
         primCount = numPrims / 2;
         break;
     case TOP_TRIANGLE_LIST:
     case TOP_RECT_LIST:
         primCount = numPrims / 3;
         break;
     case TOP_QUAD_LIST:
     case TOP_LINE_LIST_ADJ:
         primCount = numPrims / 4;
         break;
     case TOP_TRI_LIST_ADJ:
         primCount = numPrims / 6;
         break;

     // Strips/fans: guarded so that too few verts yields zero primitives
     // instead of wrapping the unsigned subtraction.
     case TOP_LINE_STRIP:
     case TOP_TRIANGLE_DISC:
         primCount = (numPrims >= 2) ? (numPrims - 1) : 0;
         break;
     case TOP_TRIANGLE_STRIP:
     case TOP_TRIANGLE_FAN:
         primCount = (numPrims >= 3) ? (numPrims - 2) : 0;
         break;
     case TOP_LISTSTRIP_ADJ:
         primCount = (numPrims >= 3) ? (numPrims - 3) : 0;
         break;
     case TOP_QUAD_STRIP:
         primCount = (numPrims >= 4) ? ((numPrims - 2) / 2) : 0;
         break;
     case TOP_TRI_STRIP_ADJ:
         primCount = (numPrims >= 4) ? ((numPrims / 2) - 2) : 0;
         break;

     // Patch lists: control points per patch is the distance from the base
     // enumerant.
     case TOP_PATCHLIST_1:
     case TOP_PATCHLIST_2:
     case TOP_PATCHLIST_3:
     case TOP_PATCHLIST_4:
     case TOP_PATCHLIST_5:
     case TOP_PATCHLIST_6:
     case TOP_PATCHLIST_7:
     case TOP_PATCHLIST_8:
     case TOP_PATCHLIST_9:
     case TOP_PATCHLIST_10:
     case TOP_PATCHLIST_11:
     case TOP_PATCHLIST_12:
     case TOP_PATCHLIST_13:
     case TOP_PATCHLIST_14:
     case TOP_PATCHLIST_15:
     case TOP_PATCHLIST_16:
     case TOP_PATCHLIST_17:
     case TOP_PATCHLIST_18:
     case TOP_PATCHLIST_19:
     case TOP_PATCHLIST_20:
     case TOP_PATCHLIST_21:
     case TOP_PATCHLIST_22:
     case TOP_PATCHLIST_23:
     case TOP_PATCHLIST_24:
     case TOP_PATCHLIST_25:
     case TOP_PATCHLIST_26:
     case TOP_PATCHLIST_27:
     case TOP_PATCHLIST_28:
     case TOP_PATCHLIST_29:
     case TOP_PATCHLIST_30:
     case TOP_PATCHLIST_31:
     case TOP_PATCHLIST_32:
         primCount = numPrims / (mode - TOP_PATCHLIST_BASE);
         break;

     // Topologies this function does not handle.
     case TOP_POLYGON:
     case TOP_POINT_LIST_BF:
     case TOP_LINE_STRIP_CONT:
     case TOP_LINE_STRIP_BF:
     case TOP_LINE_STRIP_CONT_BF:
     case TOP_TRIANGLE_FAN_NOSTIPPLE:
     case TOP_TRI_STRIP_REVERSE:
     case TOP_PATCHLIST_BASE:
     case TOP_UNKNOWN:
         SWR_INVALID("Unsupported topology: %d", mode);
         primCount = 0;
         break;
     }

     return primCount;
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes the number of verts given the number of primitives.
 /// @param mode - primitive topology for draw operation.
 /// @param numPrims - number of primitives for draw.
 /// @return Number of vertices needed for numPrims primitives; 0 for
 ///         unsupported topologies.
 uint32_t GetNumVerts(
     PRIMITIVE_TOPOLOGY mode,
     uint32_t numPrims)
 {
     uint32_t vertCount = 0;

     switch (mode)
     {
     // One vertex per primitive.
     case TOP_POINT_LIST:
     case TOP_LINE_LOOP:
         vertCount = numPrims;
         break;

     // Lists: a fixed number of verts per primitive.
     case TOP_LINE_LIST:
         vertCount = numPrims * 2;
         break;
     case TOP_TRIANGLE_LIST:
     case TOP_RECT_LIST:
         vertCount = numPrims * 3;
         break;
     case TOP_QUAD_LIST:
     case TOP_LINE_LIST_ADJ:
         vertCount = numPrims * 4;
         break;
     case TOP_TRI_LIST_ADJ:
         vertCount = numPrims * 6;
         break;

     // Strips/fans: the fixed lead-in verts are only counted when there is
     // at least one primitive.
     case TOP_LINE_STRIP:
     case TOP_TRIANGLE_DISC:
         vertCount = (numPrims != 0) ? (numPrims + 1) : 0;
         break;
     case TOP_TRIANGLE_STRIP:
     case TOP_TRIANGLE_FAN:
         vertCount = (numPrims != 0) ? (numPrims + 2) : 0;
         break;
     case TOP_LISTSTRIP_ADJ:
         vertCount = (numPrims != 0) ? (numPrims + 3) : 0;
         break;
     case TOP_QUAD_STRIP:
         vertCount = (numPrims != 0) ? (numPrims * 2 + 2) : 0;
         break;
     case TOP_TRI_STRIP_ADJ:
         vertCount = (numPrims != 0) ? ((numPrims + 2) * 2) : 0;
         break;

     // Patch lists: control points per patch is the distance from the base
     // enumerant.
     case TOP_PATCHLIST_1:
     case TOP_PATCHLIST_2:
     case TOP_PATCHLIST_3:
     case TOP_PATCHLIST_4:
     case TOP_PATCHLIST_5:
     case TOP_PATCHLIST_6:
     case TOP_PATCHLIST_7:
     case TOP_PATCHLIST_8:
     case TOP_PATCHLIST_9:
     case TOP_PATCHLIST_10:
     case TOP_PATCHLIST_11:
     case TOP_PATCHLIST_12:
     case TOP_PATCHLIST_13:
     case TOP_PATCHLIST_14:
     case TOP_PATCHLIST_15:
     case TOP_PATCHLIST_16:
     case TOP_PATCHLIST_17:
     case TOP_PATCHLIST_18:
     case TOP_PATCHLIST_19:
     case TOP_PATCHLIST_20:
     case TOP_PATCHLIST_21:
     case TOP_PATCHLIST_22:
     case TOP_PATCHLIST_23:
     case TOP_PATCHLIST_24:
     case TOP_PATCHLIST_25:
     case TOP_PATCHLIST_26:
     case TOP_PATCHLIST_27:
     case TOP_PATCHLIST_28:
     case TOP_PATCHLIST_29:
     case TOP_PATCHLIST_30:
     case TOP_PATCHLIST_31:
     case TOP_PATCHLIST_32:
         vertCount = numPrims * (mode - TOP_PATCHLIST_BASE);
         break;

     // Topologies this function does not handle.
     case TOP_POLYGON:
     case TOP_POINT_LIST_BF:
     case TOP_LINE_STRIP_CONT:
     case TOP_LINE_STRIP_BF:
     case TOP_LINE_STRIP_CONT_BF:
     case TOP_TRIANGLE_FAN_NOSTIPPLE:
     case TOP_TRI_STRIP_REVERSE:
     case TOP_PATCHLIST_BASE:
     case TOP_UNKNOWN:
         SWR_INVALID("Unsupported topology: %d", mode);
         vertCount = 0;
         break;
     }

     return vertCount;
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Return number of verts per primitive.
 /// @param topology - topology
 /// @param includeAdjVerts - when true, adjacency topologies report their
 ///                          full vertex count (4 for lines, 6 for
 ///                          triangles) instead of the base primitive's.
 /// @return Vertices per primitive; 0 for unsupported topologies.
 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
 {
     uint32_t vertsPerPrim = 0;

     switch (topology)
     {
     // Points.
     case TOP_POINT_LIST:
     case TOP_POINT_LIST_BF:
         vertsPerPrim = 1;
         break;

     // Lines without adjacency.
     case TOP_LINE_LIST:
     case TOP_LINE_STRIP:
     case TOP_LINE_LOOP:
     case TOP_LINE_STRIP_CONT:
     case TOP_LINE_STRIP_BF:
         vertsPerPrim = 2;
         break;

     // Lines with adjacency: 2 verts, or 4 when adjacent verts are counted.
     case TOP_LINE_LIST_ADJ:
     case TOP_LISTSTRIP_ADJ:
         vertsPerPrim = includeAdjVerts ? 4 : 2;
         break;

     // Triangles (and rects) without adjacency.
     case TOP_TRIANGLE_LIST:
     case TOP_TRIANGLE_STRIP:
     case TOP_TRIANGLE_FAN:
     case TOP_TRI_STRIP_REVERSE:
     case TOP_RECT_LIST:
         vertsPerPrim = 3;
         break;

     // Triangles with adjacency: 3 verts, or 6 when adjacent verts are counted.
     case TOP_TRI_LIST_ADJ:
     case TOP_TRI_STRIP_ADJ:
         vertsPerPrim = includeAdjVerts ? 6 : 3;
         break;

     // Quads.
     case TOP_QUAD_LIST:
     case TOP_QUAD_STRIP:
         vertsPerPrim = 4;
         break;

     // Patch lists: control point count is the distance from the base
     // enumerant.
     case TOP_PATCHLIST_1:
     case TOP_PATCHLIST_2:
     case TOP_PATCHLIST_3:
     case TOP_PATCHLIST_4:
     case TOP_PATCHLIST_5:
     case TOP_PATCHLIST_6:
     case TOP_PATCHLIST_7:
     case TOP_PATCHLIST_8:
     case TOP_PATCHLIST_9:
     case TOP_PATCHLIST_10:
     case TOP_PATCHLIST_11:
     case TOP_PATCHLIST_12:
     case TOP_PATCHLIST_13:
     case TOP_PATCHLIST_14:
     case TOP_PATCHLIST_15:
     case TOP_PATCHLIST_16:
     case TOP_PATCHLIST_17:
     case TOP_PATCHLIST_18:
     case TOP_PATCHLIST_19:
     case TOP_PATCHLIST_20:
     case TOP_PATCHLIST_21:
     case TOP_PATCHLIST_22:
     case TOP_PATCHLIST_23:
     case TOP_PATCHLIST_24:
     case TOP_PATCHLIST_25:
     case TOP_PATCHLIST_26:
     case TOP_PATCHLIST_27:
     case TOP_PATCHLIST_28:
     case TOP_PATCHLIST_29:
     case TOP_PATCHLIST_30:
     case TOP_PATCHLIST_31:
     case TOP_PATCHLIST_32:
         vertsPerPrim = topology - TOP_PATCHLIST_BASE;
         break;

     default:
         SWR_INVALID("Unsupported topology: %d", topology);
         break;
     }

     return vertsPerPrim;
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Generate mask from remaining work.
 /// @param numItemsRemaining - Number of work items still to be processed;
 ///                            at most KNOB_SIMD_WIDTH lanes are enabled.
 /// @return SIMD integer mask built from a bitmask with one bit per active lane.
 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
 {
     // Clamp the active lane count to one SIMD register's worth of items.
     uint32_t activeLanes = numItemsRemaining;
     if (activeLanes >= KNOB_SIMD_WIDTH)
     {
         activeLanes = KNOB_SIMD_WIDTH;
     }

     // Low activeLanes bits set; stays zero when no work remains.
     uint32_t laneBits = 0;
     if (activeLanes > 0)
     {
         laneBits = (1 << activeLanes) - 1;
     }

     return _simd_castps_si(_simd_vmask_ps(laneBits));
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief StreamOut - Streams vertex data out to SO buffers.
 ///        Generally, we are only streaming out a SIMDs worth of triangles.
 ///        For every primitive held by the PA, the attributes selected by the
 ///        stream's mask are written into pPrimData and the stream-out
 ///        function for the stream is invoked; afterwards the SO write
 ///        offsets and stream-out stats are updated.
 /// @param pDC - pointer to draw context.
 /// @param pa - primitive assembler state; supplies the bin topology, the
 ///             primitive count and the per-primitive attributes.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pPrimData - buffer that receives one primitive's attributes before
 ///                    the stream-out function is called.
 /// @param streamIndex - index of the stream whose mask, attribute offset,
 ///                      stream-out function and stats are used.
 static void StreamOut(
     DRAW_CONTEXT* pDC,
     PA_STATE& pa,
     uint32_t workerId,
     uint32_t* pPrimData,
     uint32_t streamIndex)
 {
     // NOTE(review): pContext is not referenced directly below; presumably the
     // AR_* / UPDATE_STAT_FE macros use it - confirm before removing.
     SWR_CONTEXT *pContext = pDC->pContext;

     AR_BEGIN(FEStreamout, pDC->drawId);

     const API_STATE& state = GetApiState(pDC);
     const SWR_STREAMOUT_STATE &soState = state.soState;

     // Adjacent verts are excluded (second argument is false).
     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

     // The pPrimData buffer is sparse in that we allocate memory for all
     // SWR_VTX_NUM_SLOTS attributes for each vertex (4 floats per attribute).
     // The stride is expressed in dwords.
     uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

     SWR_STREAMOUT_CONTEXT soContext = { 0 };

     // Setup buffer state pointers.
     for (uint32_t i = 0; i < 4; ++i)
     {
         soContext.pBuffer[i] = &state.soBuffer[i];
     }

     uint32_t numPrims = pa.NumPrims();

     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
     {
         DWORD slot = 0;
         // Working copy of the stream's attribute mask; bits are cleared as
         // each slot is consumed.
         uint32_t soMask = soState.streamMasks[streamIndex];

         // Write all entries into primitive data buffer for SOS.
         while (_BitScanForward(&slot, soMask))
         {
             simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
             // Slot in the PA is the mask bit index plus the stream's
             // vertex attribute offset.
             uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
             pa.AssembleSingle(paSlot, primIndex, attrib);

             // Attribute offset (in dwords) is relative to the start of the
             // vertex: each slot occupies 4 floats, so the mask bit index is
             // used directly as the slot index within pPrimData.
             // NOTE(review): this comment previously described a (slot - 1)
             // adjustment, which the code below does not perform.
             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
             {
                 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                 // Aligned 4-float store; assumes pPrimData is 16-byte
                 // aligned - TODO confirm against the allocation site.
                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
             }

             // Clear the bit just handled so the scan advances to the next slot.
             soMask &= ~(1 << slot);
         }

         // Update pPrimData pointer
         soContext.pPrimData = pPrimData;

         // Call SOS
         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
         state.pfnSoFunc[streamIndex](soContext);
     }

     // Update SO write offset. The driver provides memory for the update.
     // streamOffset is scaled by sizeof(uint32_t) before being published.
     for (uint32_t i = 0; i < 4; ++i)
     {
         if (state.soBuffer[i].pWriteOffset)
         {
             *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
         }

         if (state.soBuffer[i].soWriteEnable)
         {
             pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
             pDC->dynState.SoWriteOffsetDirty[i] = true;
         }
     }

     UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
     UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

     AR_END(FEStreamout, 1);
 }

 #if USE_SIMD16_FRONTEND
 //////////////////////////////////////////////////////////////////////////
 /// @brief Tests whether an integer value is even (a multiple of two).
 /// @param value - integer value to test
 /// @return true when the lowest bit of value is clear
 template <typename T>
 INLINE static bool IsEven(T value)
 {
     // Even numbers have their lowest bit clear.
     return !(value & 1);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Rounds an integer value up to an even number (a multiple of two).
 /// @param value - integer value to round
 /// @return value if it is already even, otherwise value + 1
 template <typename T>
 INLINE static T RoundUpEven(T value)
 {
     // The lowest bit is 1 exactly when value is odd, so adding it bumps
     // odd values to the next even number and leaves even values alone.
     return value + (value & 1);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Rounds an integer value down to an even number (a multiple of two).
 /// @param value - integer value to round
 /// @return value if it is already even, otherwise value - 1
 template <typename T>
 INLINE static T RoundDownEven(T value)
 {
     // The lowest bit is 1 exactly when value is odd, so subtracting it
     // drops odd values to the previous even number.
     return value - (value & 1);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Packs pairs of simdvertexes into simd16vertexes. Source and
 ///        destination are assumed not to overlap.
 ///
 /// Source vertex 2n goes into half 0 and source vertex 2n+1 into half 1 of
 /// destination vertex n.
 ///
 /// @param vertex_simd16 - destination array of simd16vertex
 /// @param vertex - source array of simdvertex
 /// @param vertexCount - number of source simdvertexes; expected to be even.
 ///                      If it is odd, the last destination vertex keeps a
 ///                      zeroed upper half.
 /// @param attribCount - limits the vector copies to the first attribCount
 ///                      attribs
 ///
 /// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
 ///
 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
 {
     SWR_ASSERT(vertex);
     SWR_ASSERT(vertex_simd16);
     SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

     // Staging vertex: a pair is fully packed here before being copied out.
     simd16vertex packed;

     for (uint32_t src = 0; src < vertexCount; src += 2)
     {
         // False only for the trailing vertex of an odd-sized input.
         const bool hasUpperHalf = (src + 1) < vertexCount;

         for (uint32_t attr = 0; attr < attribCount; ++attr)
         {
             for (uint32_t comp = 0; comp < 4; ++comp)
             {
                 // Lower half comes from the first vertex of the pair; the
                 // upper half starts out zeroed.
                 packed.attrib[attr][comp] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[src].attrib[attr][comp], 0);

                 if (hasUpperHalf)
                 {
                     packed.attrib[attr][comp] = _simd16_insert_ps(packed.attrib[attr][comp], vertex[src + 1].attrib[attr][comp], 1);
                 }
             }
         }

         // Copy only the requested attributes to the destination vertex.
         simd16vertex &dst = vertex_simd16[src / 2];
         for (uint32_t attr = 0; attr < attribCount; ++attr)
         {
             dst.attrib[attr] = packed.attrib[attr];
         }
     }
 }

 #endif
 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes number of invocations. The current index represents
 ///        the start of the SIMD. The max index bounds how many work
 ///        items are remaining. If there is less than a SIMD's width of
 ///        work then return the remaining amount of work.
 /// @param curIndex - The start index for the SIMD.
 /// @param maxIndex - The last index for all work items.
 /// @return min(maxIndex - curIndex, frontend SIMD width).
 static INLINE uint32_t GetNumInvocations(
     uint32_t curIndex,
     uint32_t maxIndex)
 {
     // Width of the frontend SIMD for this build configuration.
 #if USE_SIMD16_FRONTEND
     const uint32_t simdWidth = KNOB_SIMD16_WIDTH;
 #else
     const uint32_t simdWidth = KNOB_SIMD_WIDTH;
 #endif

     const uint32_t itemsLeft = maxIndex - curIndex;
     return (itemsLeft < simdWidth) ? itemsLeft : simdWidth;
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
 ///        The geometry shader will loop over each active streamout buffer, assembling
 ///        primitives for the downstream stages. When multistream output is enabled,
 ///        the generated stream ID buffer from the GS needs to be converted to a cut
 ///        buffer for the primitive assembler.
 ///
 ///        The input holds 2 bits of stream ID per vertex; the output holds 1 bit
 ///        per vertex, set when that vertex's stream ID differs from @p stream.
 /// @param stream - stream id to generate the cut buffer for
 /// @param pStreamIdBase - pointer to the stream ID buffer
 /// @param numEmittedVerts - Number of total verts emitted by the GS
 /// @param pCutBuffer - output buffer to write cuts to
 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
 {
     SWR_ASSERT(stream < MAX_SO_STREAMS);

     // One cut bit per emitted vertex, rounded UP to whole bytes. Deriving
     // this from half the (rounded-up) input byte count rounds down whenever
     // the input byte count is odd, which drops the cut bits of the trailing
     // vertices (e.g. 9..12 verts need two output bytes, not one).
     // At least one byte is always produced, even for zero verts.
     uint32_t numOutputBytes = std::max((numEmittedVerts + 7) / 8, 1U);

     for (uint32_t b = 0; b < numOutputBytes; ++b)
     {
         // Two input bytes (4 verts x 2 bits each) feed one output byte
         // (8 verts x 1 bit). Both bytes are always read, so the stream ID
         // buffer is assumed readable in 2-byte units - the single output
         // byte case has always relied on this. TODO confirm buffer padding.
         uint32_t streamIds = pStreamIdBase[2 * b] | (static_cast<uint32_t>(pStreamIdBase[2 * b + 1]) << 8);

         uint8_t outByte = 0;
         for (uint32_t v = 0; v < 8; ++v)
         {
             // A vertex belonging to another stream becomes a cut.
             if ((streamIds & 0x3) != stream)
             {
                 outByte |= (1 << v);
             }
             streamIds >>= 2;
         }

         *pCutBuffer++ = outByte;
     }
 }

 // Buffers that are allocated if GS is enabled
 struct GsBuffers
 {
     // GS input vertices; GeometryShaderStage reinterprets this as an array
     // of simdvector and fills it with the assembled input attributes.
     uint8_t* pGsIn;
     // One GS output buffer per SIMD lane, handed to the shader as its
     // output streams and read back per input primitive.
     uint8_t* pGsOut[KNOB_SIMD_WIDTH];
     // Destination of TransposeSOAtoAOS for one instance's output vertices.
     uint8_t* pGsTransposed;
     // NOTE(review): presumably the destination for cut data generated from
     // the stream ID buffer - confirm against the allocation/usage site.
     void* pStreamCutBuffer;
 };

 //////////////////////////////////////////////////////////////////////////
 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
 ///        NOTE(review): the access pattern below reads pSrc with a
 ///        per-vertex stride and writes pDst one SIMD register per attribute
 ///        component, which reads like the reverse of the SOA/AOS labels
 ///        used here - confirm the naming.
 /// @tparam SIMD_T - SIMD implementation type providing the Float/Integer
 ///                  types and the load/gather/maskstore operations used below
 /// @tparam SimdWidth - number of vertices processed per SIMD group
 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
 /// @param numVerts - Number of vertices outputted by the GS
 /// @param numAttribs - Number of attributes per vertex
 template<typename SIMD_T, uint32_t SimdWidth>
 void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
 {
     // Source: 4 floats per attribute per vertex.
     // Destination: 4 SIMD registers per attribute per group of SimdWidth vertices.
     uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
     uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;

     OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

     // Lane i gathers from vertex i of the current group; offsets are in
     // bytes (the gathers below use a scale factor of 1).
     for (uint32_t i = 0; i < SimdWidth; ++i)
     {
         gatherOffsets[i] = srcVertexStride * i;
     }
     auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);

     // Number of SIMD groups needed to cover numVerts (rounded up).
     uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
     uint32_t remainingVerts = numVerts;

     for (uint32_t s = 0; s < numSimd; ++s)
     {
         uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
         uint8_t* pDstBase = pDst + s * dstVertexStride;

         // Compute mask to prevent src overflow
         // Only lanes that correspond to real vertices are gathered and stored.
         uint32_t mask = std::min(remainingVerts, SimdWidth);
         mask = GenMask(mask);
         auto vMask = SIMD_T::vmask_ps(mask);
         auto viMask = SIMD_T::castps_si(vMask);

         for (uint32_t a = 0; a < numAttribs; ++a)
         {
             // Gather the x, y, z and w components of attribute a from each
             // active vertex; inactive lanes receive zero.
             auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
             auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
             auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
             auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);

             // Write the four component registers back to back; masked-off
             // lanes in the destination are left untouched.
             SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);

             // Advance to the next attribute in both source and destination.
             pSrcBase += sizeof(float) * 4;
             pDstBase += sizeof(typename SIMD_T::Float) * 4;
         }
         // On a final partial group this unsigned subtraction wraps, but the
         // loop exits before the value is read again.
         remainingVerts -= SimdWidth;
     }
 }


 //////////////////////////////////////////////////////////////////////////
 /// @brief Implements GS stage.
 /// @param pDC - pointer to draw context.
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pa - The primitive assembly object.
 /// @param pGsOut - output stream for GS
 template <
     typename HasStreamOutT,
     typename HasRastT>
 static void GeometryShaderStage(
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     PA_STATE& pa,
     GsBuffers* pGsBuffers,
     uint32_t* pSoPrimData,
 #if USE_SIMD16_FRONTEND
     uint32_t numPrims_simd8,
 #endif
     simdscalari const &primID)
 {
     SWR_CONTEXT *pContext = pDC->pContext;

     AR_BEGIN(FEGeometryShader, pDC->drawId);

     const API_STATE& state = GetApiState(pDC);
     const SWR_GS_STATE* pState = &state.gsState;
     SWR_GS_CONTEXT gsContext;

     static uint8_t sNullBuffer[128] = { 0 };

     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
     {
         gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
     }
     gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
     gsContext.PrimitiveID = primID;

     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
     simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

     // assemble all attributes for the input primitive
     gsContext.inputVertStride = pState->inputVertStride;
     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
     {
         uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
         uint32_t attribSlot = pState->vertexAttribOffset + slot;
         pa.Assemble(srcAttribSlot, attrib);

         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
             gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
         }
     }

     // assemble position
     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
     {
         gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
     }

     // record valid prims from the frontend to avoid over binning the newly generated
     // prims from the GS
 #if USE_SIMD16_FRONTEND
     uint32_t numInputPrims = numPrims_simd8;
 #else
     uint32_t numInputPrims = pa.NumPrims();
 #endif

     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
     {
         gsContext.InstanceID = instance;
         gsContext.mask = GenerateMask(numInputPrims);

         // execute the geometry shader
         state.pfnGsFunc(GetPrivateState(pDC), &gsContext);

         for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
         {
             gsContext.pStreams[i] += pState->allocationSize;
         }
     }

     // set up new binner and state for the GS output topology
 #if USE_SIMD16_FRONTEND
     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
     if (HasRastT::value)
     {
         switch (pState->outputTopology)
         {
         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
         }
     }

 #else
     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
     if (HasRastT::value)
     {
         switch (pState->outputTopology)
         {
         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
         default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
         }
     }

 #endif
     // foreach input prim:
     // - setup a new PA based on the emitted verts for that prim
     // - loop over the new verts, calling PA to assemble each prim
     uint32_t* pPrimitiveId = (uint32_t*)&primID;

     uint32_t totalPrimsGenerated = 0;
     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
     {
         uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

         // Vertex count is either emitted by shader or static
         uint32_t vertexCount = 0;
         if (pState->staticVertexCount)
         {
             vertexCount = pState->staticVertexCount;
         }
         else
         {
             // If emitted in shader, it should be stored in the first dword of the output buffer
             vertexCount = *(uint32_t*)pInstanceBase;
         }

         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
         {
             uint32_t numEmittedVerts = vertexCount;
             if (numEmittedVerts == 0)
             {
                 continue;
             }

             uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
             uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
             uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

 #if USE_SIMD16_FRONTEND
             TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
 #else
             TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
 #endif

             uint32_t numAttribs = state.feNumAttributes;

             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
             {
                 bool processCutVerts = false;
                 uint8_t* pCutBuffer = pCutBase;

                 // assign default stream ID, only relevant when GS is outputting a single stream
                 uint32_t streamID = 0;
                 if (pState->isSingleStream)
                 {
                     processCutVerts = true;
                     streamID = pState->singleStreamID;
                     if (streamID != stream) continue;
                 }
                 else
                 {
                     // early exit if this stream is not enabled for streamout
                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                     {
                         continue;
                     }

                     // multi-stream output, need to translate StreamID buffer to a cut buffer
                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                     pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                     processCutVerts = false;
                 }

 #if USE_SIMD16_FRONTEND
                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);

 #else
                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

 #endif
                 while (gsPa.GetNextStreamOutput())
                 {
                     do
                     {
 #if USE_SIMD16_FRONTEND
                         simd16vector attrib_simd16[3];

                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);

 #else
                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

 #endif
                         if (assemble)
                         {
                             totalPrimsGenerated += gsPa.NumPrims();

                             if (HasStreamOutT::value)
                             {
 #if ENABLE_AVX512_SIMD16
                                 gsPa.useAlternateOffset = false;
 #endif
                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                             }

                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
                             {
 #if USE_SIMD16_FRONTEND
                                 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
                                 {
                                     gsPa.useAlternateOffset = false;
                                     pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
                                 }
 #else
                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
 #endif
                             }
                         }
                     } while (gsPa.NextPrim());
                 }
             }
         }
     }

     // update GS pipeline stats
     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
     AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
     AR_END(FEGeometryShader, 1);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Carve the geometry shader scratch buffers out of the draw
 ///        context's arena and record them in pGsBuffers.
 /// @tparam SIMD_T - SIMD traits type; SIMD_T::Vec4 sizes the transposed output
 /// @tparam SIMD_WIDTH - lane count used to batch the transposed output verts
 /// @param pDC - pointer to draw context; its arena supplies every allocation.
 /// @param state - API state; only state.gsState is consulted.
 /// @param vertsPerPrim - number of vertices per input primitive
 /// @param pGsBuffers - receives the pGsIn, pGsOut[], pGsTransposed and
 ///                     pStreamCutBuffer pointers
 template<typename SIMD_T, uint32_t SIMD_WIDTH>
 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
 {
     auto pGsArena = pDC->pArena;
     SWR_ASSERT(pGsArena != nullptr);
     SWR_ASSERT(state.gsState.gsEnable);

     const SWR_GS_STATE& gs = state.gsState;

     // Every buffer below is requested with 32-byte alignment.
     auto allocBytes = [&](size_t numBytes)
     {
         return reinterpret_cast<uint8_t*>(pGsArena->AllocAligned(numBytes, 32));
     };

     // GS input: one simdvector per attribute slot for each vertex of the prim
     const uint32_t inputBytes = gs.inputVertStride * sizeof(simdvector) * vertsPerPrim;
     pGsBuffers->pGsIn = allocBytes(inputBytes);

     // GS output: one buffer per SIMD lane, each holding every GS instance's allocation
     const uint32_t outputBytesPerLane = gs.instanceCount * gs.allocationSize;
     for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
     {
         pGsBuffers->pGsOut[lane] = allocBytes(outputBytesPerLane);
     }

     // Transposed GS output: maxNumVerts rounded up to whole SIMD batches
     const uint32_t batchCount = AlignUp(gs.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
     const uint32_t transposedBytes = batchCount * gs.outputVertexSize * sizeof(typename SIMD_T::Vec4);
     pGsBuffers->pGsTransposed = allocBytes(transposedBytes);

     // The temporary stream->cut buffer is only needed for multi-stream output
     pGsBuffers->pStreamCutBuffer = nullptr;
     if (!gs.isSingleStream)
     {
         pGsBuffers->pStreamCutBuffer = allocBytes(AlignUp(gs.maxNumVerts * 2, 32));
     }
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Contains all data generated by the HS and passed to the
 /// tessellator and DS.
 ///
 /// The storage is obtained with AlignedMalloc and zero-filled with memset
 /// (see AllocateTessellationData), so no constructor runs on it.
 struct TessellationThreadLocalData
 {
     /// Hull shader context passed to the HS function by TessellationStages.
     SWR_HS_CONTEXT hsContext;
     /// Per-lane patch storage; TessellationStages points hsContext.pCPout at it.
     ScalarPatch patchData[KNOB_SIMD_WIDTH];
     /// Memory handed to TSInitCtx for the tessellator context; allocated by
     /// TessellationStages when TSInitCtx returns nullptr.
     void* pTxCtx;
     /// Size passed to TSInitCtx and used as the allocation size for pTxCtx.
     /// NOTE(review): presumably TSInitCtx reports the required size through
     /// this field - confirm against the tessellator interface.
     size_t tsCtxSize;

     /// Domain shader output buffer, (re)allocated by TessellationStages when
     /// a patch needs more than dsOutputAllocSize bytes.
     simdscalar* pDSOutput;
     /// Size in bytes of the current pDSOutput allocation (0 when none).
     size_t dsOutputAllocSize;
 };

 /// Tessellation data of the calling thread; nullptr until
 /// AllocateTessellationData has run on that thread.
 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

 //////////////////////////////////////////////////////////////////////////
 /// @brief Lazily create the tessellation data for this worker thread.
 ///        The storage is zero-filled when first created; if it already
 ///        exists for the calling thread nothing is done.
 /// @param pContext - pointer to SWR context (not referenced in the body).
 INLINE
 static void AllocateTessellationData(SWR_CONTEXT* pContext)
 {
     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
     if (gt_pTessellationThreadData != nullptr)
     {
         // Already set up for this thread.
         return;
     }

     const size_t dataSize = sizeof(TessellationThreadLocalData);

     // 64-byte aligned storage, cleared so all pointers/sizes start out as zero
     gt_pTessellationThreadData =
         reinterpret_cast<TessellationThreadLocalData*>(AlignedMalloc(dataSize, 64));
     memset(gt_pTessellationThreadData, 0, dataSize);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Implements Tessellation Stages: runs the hull shader on the
 ///        assembled input prims, then for each input prim runs the
 ///        tessellator and the domain shader, and forwards the resulting
 ///        primitives to the geometry shader stage or to stream-out and
 ///        the clipper.
 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
 /// @tparam HasStreamOutT - Is stream-out enabled
 /// @tparam HasRastT - Is rasterization enabled
 /// @param pDC - pointer to draw context.
 /// @param workerId - thread's worker id. Every thread has a unique id.
 /// @param pa - The primitive assembly object.
 /// @param pGsBuffers - GS buffers, forwarded to GeometryShaderStage
 /// @param pSoPrimData - stream-out prim data, forwarded to StreamOut and
 ///                      GeometryShaderStage
 /// @param numPrims_simd8 - (USE_SIMD16_FRONTEND only) number of input prims
 ///                         to process in this call
 /// @param primID - primitive IDs of the input prims, read as one uint32_t
 ///                 per input prim
 template <
     typename HasGeometryShaderT,
     typename HasStreamOutT,
     typename HasRastT>
 static void TessellationStages(
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     PA_STATE& pa,
     GsBuffers* pGsBuffers,
     uint32_t* pSoPrimData,
 #if USE_SIMD16_FRONTEND
     uint32_t numPrims_simd8,
 #endif
     simdscalari const &primID)
 {
     // NOTE(review): pContext is not referenced by name in the statements
     // below; presumably the AR_* / UPDATE_STAT_FE macros use it - confirm
     // before removing.
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
     const SWR_TS_STATE& tsState = state.tsState;

     // The thread-local data must have been created by AllocateTessellationData.
     SWR_ASSERT(gt_pTessellationThreadData);

     // Try to initialize the tessellator context in this thread's cached
     // context memory.
     HANDLE tsCtx = TSInitCtx(
         tsState.domain,
         tsState.partitioning,
         tsState.tsOutputTopology,
         gt_pTessellationThreadData->pTxCtx,
         gt_pTessellationThreadData->tsCtxSize);
     if (tsCtx == nullptr)
     {
         // Initialization failed: allocate tsCtxSize bytes of context memory
         // and retry once.
         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
         tsCtx = TSInitCtx(
             tsState.domain,
             tsState.partitioning,
             tsState.tsOutputTopology,
             gt_pTessellationThreadData->pTxCtx,
             gt_pTessellationThreadData->tsCtxSize);
     }
     SWR_ASSERT(tsCtx);

     // Pick the clipper that matches the post-DS topology. It is only
     // selected when rasterization is enabled; otherwise it stays nullptr.
 #if USE_SIMD16_FRONTEND
     PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
     if (HasRastT::value)
     {
         switch (tsState.postDSTopology)
         {
         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
         case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
         }
     }

 #else
     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
     if (HasRastT::value)
     {
         switch (tsState.postDSTopology)
         {
         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
         default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
         }
     }

 #endif
     // Set up the HS context: pCPout points at the thread-local patch storage
     // and the primitive IDs are passed through.
     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
     hsContext.pCPout = gt_pTessellationThreadData->patchData;
     hsContext.PrimitiveID = primID;

     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
     // Max storage for one attribute for an entire simdprimitive
     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

     // assemble all attributes for the input primitives
     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
     {
         uint32_t attribSlot = tsState.vertexAttribOffset + slot;
         pa.Assemble(attribSlot, simdattrib);

         // Copy the assembled attribute into each vertex of the HS input.
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
             hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
         }
     }

 #if defined(_DEBUG)
     // Debug builds: fill the patch storage with a 0x90 byte pattern before
     // the HS runs.
     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
 #endif

     // With the SIMD16 frontend the caller supplies the prim count for this
     // call; otherwise it is taken from the PA.
 #if USE_SIMD16_FRONTEND
     uint32_t numPrims = numPrims_simd8;
 #else
     uint32_t numPrims = pa.NumPrims();
 #endif
     hsContext.mask = GenerateMask(numPrims);

     // Run the HS
     AR_BEGIN(FEHullShader, pDC->drawId);
     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
     AR_END(FEHullShader, 0);

     UPDATE_STAT_FE(HsInvocations, numPrims);

     // View the SIMD primitive-ID register as an array of uint32_t so it can
     // be indexed per input prim.
     const uint32_t* pPrimId = (const uint32_t*)&primID;

     // Tessellate, domain-shade and emit each input prim in turn.
     for (uint32_t p = 0; p < numPrims; ++p)
     {
         // Run Tessellator
         SWR_TS_TESSELLATED_DATA tsData = { 0 };
         AR_BEGIN(FETessellation, pDC->drawId);
         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
         AR_EVENT(TessPrimCount(1));
         AR_END(FETessellation, 0);

         // Nothing was generated for this prim; skip the DS and everything after it.
         if (tsData.NumPrimitives == 0)
         {
             continue;
         }
         SWR_ASSERT(tsData.NumDomainPoints);

         // Allocate DS Output memory
         // Each DS invocation covers KNOB_SIMD_WIDTH domain points.
         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
 #if USE_SIMD16_FRONTEND
         size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize;      // simd8 -> simd16, padding
 #else
         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
 #endif
         // Replace the thread-local DS output buffer if it is too small for
         // this prim; the old contents are not preserved.
         if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
         {
             AlignedFree(gt_pTessellationThreadData->pDSOutput);
             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
             gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
         }
         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
         SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

 #if defined(_DEBUG)
         // Debug builds: fill the DS output with a 0x90 byte pattern.
         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
 #endif

         // Run Domain Shader
         SWR_DS_CONTEXT dsContext;
         dsContext.PrimitiveID = pPrimId[p];
         dsContext.pCpIn = &hsContext.pCPout[p];
         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
         dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
 #if USE_SIMD16_FRONTEND
         dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
 #else
         dsContext.vectorStride = requiredDSVectorInvocations;
 #endif

         // Number of domain points already handed to the DS.
         uint32_t dsInvocations = 0;

         // One DS call per SIMD vector of domain points; the mask is built
         // from the number of points still remaining.
         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
         {
             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

             AR_BEGIN(FEDomainShader, pDC->drawId);
             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
             AR_END(FEDomainShader, 0);

             dsInvocations += KNOB_SIMD_WIDTH;
         }
         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

 #if USE_SIMD16_FRONTEND
         // The stride is halved below, so it has to be even.
         SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16

 #endif
         // Primitive assembler over the DS output, driven by the tessellator's
         // index lists and primitive count.
         PA_TESS tessPa(
             pDC,
 #if USE_SIMD16_FRONTEND
             reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
             dsContext.vectorStride / 2,                                         // simd8 -> simd16
 #else
             dsContext.pOutputData,
             dsContext.vectorStride,
 #endif
             SWR_VTX_NUM_SLOTS,
             tsState.numDsOutputAttribs,
             tsData.ppIndices,
             tsData.NumPrimitives,
             tsState.postDSTopology,
             numVertsPerPrim);

         while (tessPa.HasWork())
         {
 #if USE_SIMD16_FRONTEND
             // Split the simd16 prim count and primitive IDs into lo/hi simd8
             // halves. These locals intentionally shadow the outer numPrims
             // and the primID parameter.
             const uint32_t numPrims = tessPa.NumPrims();
             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

             const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
             const simdscalari primID_hi = _simd16_extract_si(primID, 1);

 #endif
             if (HasGeometryShaderT::value)
             {
 #if USE_SIMD16_FRONTEND
                 // Run the GS on the lo half, then on the hi half (with the
                 // alternate offset selected) when it contains any prims.
                 tessPa.useAlternateOffset = false;
                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                 if (numPrims_hi)
                 {
                     tessPa.useAlternateOffset = true;
                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                 }
 #else
                 GeometryShaderStage<HasStreamOutT, HasRastT>(
                     pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
 #endif
             }
             else
             {
                 // No GS: stream out and/or clip the tessellated prims directly.
                 if (HasStreamOutT::value)
                 {
 #if ENABLE_AVX512_SIMD16
                     tessPa.useAlternateOffset = false;
 #endif
                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                 }

                 if (HasRastT::value)
                 {
 #if USE_SIMD16_FRONTEND
                     simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
 #else
                     simdvector      prim[3];        // Only deal with triangles, lines, or points
 #endif
                     AR_BEGIN(FEPAAssemble, pDC->drawId);
                     bool assemble =
 #if USE_SIMD16_FRONTEND
                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
 #else
                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
 #endif
                     AR_END(FEPAAssemble, 1);
                     SWR_ASSERT(assemble);

                     // Hand the assembled positions to the clipper chosen above.
                     SWR_ASSERT(pfnClipFunc);
 #if USE_SIMD16_FRONTEND

                     {
                         tessPa.useAlternateOffset = false;
                         pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
                     }
 #else
                     pfnClipFunc(pDC, tessPa, workerId, prim,
                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
 #endif
                 }
             }

             tessPa.NextPrim();

         } // while (tessPa.HasWork())
     } // for (uint32_t p = 0; p < numPrims; ++p)

 #if USE_SIMD16_FRONTEND
     // SIMD16 frontend: release the DS output buffer at the end of every call
     // and reset its recorded size, so the next call allocates it afresh.
     if (gt_pTessellationThreadData->pDSOutput != nullptr)
     {
         AlignedFree(gt_pTessellationThreadData->pDSOutput);
         gt_pTessellationThreadData->pDSOutput = nullptr;
     }
     gt_pTessellationThreadData->dsOutputAllocSize = 0;

 #endif
     TSDestroyCtx(tsCtx);
 }

 /// Per-thread vertex store handed to the primitive assembler factory in
 /// ProcessDraw; it is freed and reallocated there whenever a draw needs more
 /// than gVertexStoreSize bytes.
 THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
 /// Size in bytes of the current gpVertexStore allocation (0 when none).
 THREAD uint32_t gVertexStoreSize = 0;

 //////////////////////////////////////////////////////////////////////////
 /// @brief FE handler for SwrDraw.
 /// @tparam IsIndexedT - Is indexed drawing enabled
 /// @tparam IsCutIndexEnabledT - Is the cut index enabled
 /// @tparam HasTessellationT - Is tessellation enabled
 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
 /// @tparam HasStreamOutT - Is stream-out enabled
 /// @tparam HasRastT - Is rasterization enabled
 /// @param pContext - pointer to SWR context.
 /// @param pDC - pointer to draw context.
 /// @param workerId - thread's worker id.
 /// @param pUserData - Pointer to DRAW_WORK
 template <
     typename IsIndexedT,
     typename IsCutIndexEnabledT,
     typename HasTessellationT,
     typename HasGeometryShaderT,
     typename HasStreamOutT,
     typename HasRastT>
 void ProcessDraw(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {

 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_QUEUE_FE)
     {
         return;
     }
 #endif

     AR_BEGIN(FEProcessDraw, pDC->drawId);

     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
     const API_STATE&    state = GetApiState(pDC);

     uint32_t indexSize = 0;
     uint32_t endVertex = work.numVerts;

     const int32_t* pLastRequestedIndex = nullptr;
     if (IsIndexedT::value)
     {
         switch (work.type)
         {
         case R32_UINT:
             indexSize = sizeof(uint32_t);
             pLastRequestedIndex = &(work.pIB[endVertex]);
             break;
         case R16_UINT:
             indexSize = sizeof(uint16_t);
             // nasty address offset to last index
             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
             break;
         case R8_UINT:
             indexSize = sizeof(uint8_t);
             // nasty address offset to last index
             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
             break;
         default:
             SWR_INVALID("Invalid work.type: %d", work.type);
         }
     }
     else
     {
         // No cuts, prune partial primitives.
         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
     }

 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
 #endif

     GsBuffers gsBuffers;
     if (HasGeometryShaderT::value)
     {
 #if USE_SIMD16_FRONTEND
         AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
 #else
         AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
 #endif
     }

     if (HasTessellationT::value)
     {
         SWR_ASSERT(state.tsState.tsEnable == true);
         SWR_ASSERT(state.pfnHsFunc != nullptr);
         SWR_ASSERT(state.pfnDsFunc != nullptr);

         AllocateTessellationData(pContext);
     }
     else
     {
         SWR_ASSERT(state.tsState.tsEnable == false);
         SWR_ASSERT(state.pfnHsFunc == nullptr);
         SWR_ASSERT(state.pfnDsFunc == nullptr);
     }

     // allocate space for streamout input prim data
     uint32_t* pSoPrimData = nullptr;
     if (HasStreamOutT::value)
     {
         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
     }

     const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
 #if USE_SIMD16_FRONTEND
     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
 #else
     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
 #endif

     SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

     // Compute storage requirements for vertex store
     // TODO: allocation needs to be rethought for better cut support
     uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
     uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;

     // grow the vertex store for the PA as necessary
     if (gVertexStoreSize < vertexStoreSize)
     {
         if (gpVertexStore != nullptr)
         {
             AlignedFree(gpVertexStore);
             gpVertexStore = nullptr;
         }

         SWR_ASSERT(gpVertexStore == nullptr);

         gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
         gVertexStoreSize = vertexStoreSize;

         SWR_ASSERT(gpVertexStore != nullptr);
     }

     // choose primitive assembler

     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
     PA_STATE& pa = paFactory.GetPA();

 #if USE_SIMD16_FRONTEND
 #if USE_SIMD16_SHADERS
     simd16vertex        vin;
 #else
     simdvertex          vin_lo;
     simdvertex          vin_hi;
 #endif
     SWR_VS_CONTEXT      vsContext_lo;
     SWR_VS_CONTEXT      vsContext_hi;

 #if USE_SIMD16_SHADERS
     vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
     vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
 #else
     vsContext_lo.pVin = &vin_lo;
     vsContext_hi.pVin = &vin_hi;
 #endif
     vsContext_lo.AlternateOffset = 0;
     vsContext_hi.AlternateOffset = 1;

     SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };

     fetchInfo_lo.pStreams = &state.vertexBuffers[0];
     fetchInfo_lo.StartInstance = work.startInstance;
     fetchInfo_lo.StartVertex = 0;

     if (IsIndexedT::value)
     {
         fetchInfo_lo.BaseVertex = work.baseVertex;

         // if the entire index buffer isn't being consumed, set the last index
         // so that fetches < a SIMD wide will be masked off
         fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
         if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
         {
             fetchInfo_lo.pLastIndex = pLastRequestedIndex;
         }
     }
     else
     {
         fetchInfo_lo.StartVertex = work.startVertex;
     }

     SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;

     const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
     {
         uint32_t  i = 0;

         simd16scalari vIndex;

         if (IsIndexedT::value)
         {
             fetchInfo_lo.pIndices = work.pIB;
             fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
         }
         else
         {
             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

             fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
             fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
         }

         fetchInfo_lo.CurInstance = instanceNum;
         fetchInfo_hi.CurInstance = instanceNum;

         vsContext_lo.InstanceID = instanceNum;
         vsContext_hi.InstanceID = instanceNum;

         while (pa.HasWork())
         {
             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
             // So we need to keep this outside of (i < endVertex) check.

             simdmask *pvCutIndices_lo = nullptr;
             simdmask *pvCutIndices_hi = nullptr;

             if (IsIndexedT::value)
             {
                 // simd16mask <=> simdmask[2]

                 pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                 pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
             }

             simd16vertex &vout = pa.GetNextVsOutput();

             vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
             vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);

             if (i < endVertex)
             {
                 // 1. Execute FS/VS for a single SIMD.
                 AR_BEGIN(FEFetchShader, pDC->drawId);
 #if USE_SIMD16_SHADERS
                 state.pfnFetchFunc(fetchInfo_lo, vin);
 #else
                 state.pfnFetchFunc(fetchInfo_lo, vin_lo);

                 if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                 {
                     state.pfnFetchFunc(fetchInfo_hi, vin_hi);
                 }
 #endif
                 AR_END(FEFetchShader, 0);

                 // forward fetch generated vertex IDs to the vertex shader
 #if USE_SIMD16_SHADERS
 #if 0
                 vsContext_lo.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 0);
                 vsContext_hi.VertexID = _simd16_extract(fetchInfo_lo.VertexID, 1);
 #else
                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
 #endif
 #else
                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                 vsContext_hi.VertexID = fetchInfo_hi.VertexID;
 #endif

                 // Setup active mask for vertex shader.
                 vsContext_lo.mask = GenerateMask(endVertex - i);
                 vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));

                 // forward cut mask to the PA
                 if (IsIndexedT::value)
                 {
 #if USE_SIMD16_SHADERS
 #if 0
                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 0)));
                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(_simd16_extract(fetchInfo_lo.CutMask, 1)));
 #else
                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
 #endif
 #else
                     *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                     *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
 #endif
                 }

                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
                     AR_BEGIN(FEVertexShader, pDC->drawId);
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                     if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                     {
                         state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
                     }
                     AR_END(FEVertexShader, 0);

                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
             }

             // 2. Assemble primitives given the last two SIMD.
             do
             {
                 simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                 RDTSC_START(FEPAAssemble);
                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                 RDTSC_STOP(FEPAAssemble, 1, 0);

 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
 #if KNOB_ENABLE_TOSS_POINTS
                     if (!KNOB_TOSS_VS)
 #endif
                     {
                         if (assemble)
                         {
                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                             const uint32_t numPrims = pa.NumPrims();
                             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                             const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                             const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                             const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                             const simdscalari primID_hi = _simd16_extract_si(primID, 1);

                             if (HasTessellationT::value)
                             {
                                 pa.useAlternateOffset = false;
                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                                 if (numPrims_hi)
                                 {
                                     pa.useAlternateOffset = true;
                                     TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                                 }
                             }
                             else if (HasGeometryShaderT::value)
                             {
                                 pa.useAlternateOffset = false;
                                 GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                                 if (numPrims_hi)
                                 {
                                     pa.useAlternateOffset = true;
                                     GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                                 }
                             }
                             else
                             {
                                 // If streamout is enabled then stream vertices out to memory.
                                 if (HasStreamOutT::value)
                                 {
                                     pa.useAlternateOffset = false;
                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                 }

                                 if (HasRastT::value)
                                 {
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
                                     {
                                         pa.useAlternateOffset = false;
                                         pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
                                     }
                                 }
                             }
                         }
                     }
                 }
             } while (pa.NextPrim());

             if (IsIndexedT::value)
             {
                 fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                 fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
             }
             else
             {
                 vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
             }

             i += KNOB_SIMD16_WIDTH;
         }

         pa.Reset();
     }

 #else
     SWR_VS_CONTEXT      vsContext;
     SWR_FETCH_CONTEXT   fetchInfo = { 0 };

     fetchInfo.pStreams = &state.vertexBuffers[0];
     fetchInfo.StartInstance = work.startInstance;
     fetchInfo.StartVertex = 0;

     if (IsIndexedT::value)
     {
         fetchInfo.BaseVertex = work.baseVertex;

         // if the entire index buffer isn't being consumed, set the last index
         // so that fetches < a SIMD wide will be masked off
         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
         if (pLastRequestedIndex < fetchInfo.pLastIndex)
         {
             fetchInfo.pLastIndex = pLastRequestedIndex;
         }
     }
     else
     {
         fetchInfo.StartVertex = work.startVertex;
     }

     const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
     {
         simdscalari vIndex;
         uint32_t  i = 0;

         if (IsIndexedT::value)
         {
             fetchInfo.pIndices = work.pIB;
         }
         else
         {
             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
             fetchInfo.pIndices = (const int32_t*)&vIndex;
         }

         fetchInfo.CurInstance = instanceNum;
         vsContext.InstanceID = instanceNum;

         while (pa.HasWork())
         {
             // GetNextVsOutput currently has the side effect of updating some PA state machine state.
             // So we need to keep this outside of (i < endVertex) check.
             simdmask* pvCutIndices = nullptr;
             if (IsIndexedT::value)
             {
                 pvCutIndices = &pa.GetNextVsIndices();
             }

             simdvertex& vout = pa.GetNextVsOutput();
             vsContext.pVin = &vout;
             vsContext.pVout = &vout;

             if (i < endVertex)
             {

                 // 1. Execute FS/VS for a single SIMD.
                 AR_BEGIN(FEFetchShader, pDC->drawId);
                 state.pfnFetchFunc(fetchInfo, vout);
                 AR_END(FEFetchShader, 0);

                 // forward fetch generated vertex IDs to the vertex shader
                 vsContext.VertexID = fetchInfo.VertexID;

                 // Setup active mask for vertex shader.
                 vsContext.mask = GenerateMask(endVertex - i);

                 // forward cut mask to the PA
                 if (IsIndexedT::value)
                 {
                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                 }

                 UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
                     AR_BEGIN(FEVertexShader, pDC->drawId);
                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                     AR_END(FEVertexShader, 0);

                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                 }
             }

             // 2. Assemble primitives given the last two SIMD.
             do
             {
                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                 // PaAssemble returns false if there is not enough verts to assemble.
                 AR_BEGIN(FEPAAssemble, pDC->drawId);
                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                 AR_END(FEPAAssemble, 1);

 #if KNOB_ENABLE_TOSS_POINTS
                 if (!KNOB_TOSS_FETCH)
 #endif
                 {
 #if KNOB_ENABLE_TOSS_POINTS
                     if (!KNOB_TOSS_VS)
 #endif
                     {
                         if (assemble)
                         {
                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                             if (HasTessellationT::value)
                             {
                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                     pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
                             }
                             else if (HasGeometryShaderT::value)
                             {
                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
                                     pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
                             }
                             else
                             {
                                 // If streamout is enabled then stream vertices out to memory.
                                 if (HasStreamOutT::value)
                                 {
                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                 }

                                 if (HasRastT::value)
                                 {
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                 }
                             }
                         }
                     }
                 }
             } while (pa.NextPrim());

             if (IsIndexedT::value)
             {
                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
             }
             else
             {
                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
             }

             i += KNOB_SIMD_WIDTH;
         }
         pa.Reset();
     }

 #endif

     AR_END(FEProcessDraw, numPrims * work.numInstances);
 }

 //////////////////////////////////////////////////////////////////////////
 /// @brief Chooser type handed to TemplateArgUnroller by GetProcessDrawFunc.
 ///        Exposes the function pointer type of the draw front-end and a
 ///        static accessor that yields the matching ProcessDraw
 ///        instantiation.
 struct FEDrawChooser
 {
     using FuncType = PFN_FE_WORK_FUNC;

     /// @brief Returns the address of ProcessDraw instantiated with the
     ///        supplied template arguments.
     template <typename... DrawTraitsT>
     static FuncType GetFunc()
     {
         FuncType pfnDraw = ProcessDraw<DrawTraitsT...>;
         return pfnDraw;
     }
 };


 //////////////////////////////////////////////////////////////////////////
 /// @brief Selects the templated draw front-end function that matches the
 ///        given pipeline configuration flags.
 /// @param IsIndexed - forwarded as the first unroller argument.
 /// @param IsCutIndexEnabled - forwarded as the second unroller argument.
 /// @param HasTessellation - forwarded as the third unroller argument.
 /// @param HasGeometryShader - forwarded as the fourth unroller argument.
 /// @param HasStreamOut - forwarded as the fifth unroller argument.
 /// @param HasRasterization - forwarded as the sixth unroller argument.
 /// @return Function pointer produced by TemplateArgUnroller for
 ///         FEDrawChooser (presumably the ProcessDraw instantiation whose
 ///         template arguments mirror these flags; see TemplateArgUnroller).
 PFN_FE_WORK_FUNC GetProcessDrawFunc(
     bool IsIndexed,
     bool IsCutIndexEnabled,
     bool HasTessellation,
     bool HasGeometryShader,
     bool HasStreamOut,
     bool HasRasterization)
 {
     using DrawFuncUnroller = TemplateArgUnroller<FEDrawChooser>;

     // Flag order must stay in sync with the order the unroller expects.
     PFN_FE_WORK_FUNC pfnDraw = DrawFuncUnroller::GetFunc(
         IsIndexed,
         IsCutIndexEnabled,
         HasTessellation,
         HasGeometryShader,
         HasStreamOut,
         HasRasterization);

     return pfnDraw;
 }