/*
*
* Copyright (C) 2015 Valve Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Author: Chia-I Wu <olvaffe@gmail.com>
* Author: Chia-I Wu <olv@lunarg.com>
* Author: Cody Northrop <cody@lunarg.com>
* Author: Courtney Goeltzenleuchter <courtney@LunarG.com>
*
*/
#include <math.h>
#include "genhw/genhw.h"
#include "buf.h"
#include "desc.h"
#include "img.h"
#include "mem.h"
#include "pipeline.h"
#include "sampler.h"
#include "shader.h"
#include "state.h"
#include "view.h"
#include "cmd_priv.h"
#include "fb.h"
static void gen6_3DPRIMITIVE(struct intel_cmd *cmd,
int prim_type, bool indexed,
uint32_t vertex_count,
uint32_t vertex_start,
uint32_t instance_count,
uint32_t instance_start,
uint32_t vertex_base)
{
const uint8_t cmd_len = 6;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) |
prim_type << GEN6_3DPRIM_DW0_TYPE__SHIFT |
(cmd_len - 2);
if (indexed)
dw0 |= GEN6_3DPRIM_DW0_ACCESS_RANDOM;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = vertex_count;
dw[2] = vertex_start;
dw[3] = instance_count;
dw[4] = instance_start;
dw[5] = vertex_base;
}
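/*
 * The Gen7 variant: topology and access type move out of DW0 into a
 * dedicated DW1, growing the packet to 7 dwords; the remaining dwords
 * keep the Gen6 ordering.
 */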
static void gen7_3DPRIMITIVE(struct intel_cmd *cmd,
int prim_type, bool indexed,
uint32_t vertex_count,
uint32_t vertex_start,
uint32_t instance_count,
uint32_t instance_start,
uint32_t vertex_base)
{
const uint8_t cmd_len = 7;
uint32_t dw0, dw1, *dw;
CMD_ASSERT(cmd, 7, 7.5);
dw0 = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2);
dw1 = prim_type << GEN7_3DPRIM_DW1_TYPE__SHIFT;
if (indexed)
dw1 |= GEN7_3DPRIM_DW1_ACCESS_RANDOM;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = dw1;
dw[2] = vertex_count;
dw[3] = vertex_start;
dw[4] = instance_count;
dw[5] = instance_start;
dw[6] = vertex_base;
}
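/*
 * Emit PIPE_CONTROL with the given DW1 flags and an optional post-sync
 * write of imm to bo at bo_offset.  The asserts below encode the PRM
 * restrictions on which DW1 flags may be combined.
 */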
static void gen6_PIPE_CONTROL(struct intel_cmd *cmd, uint32_t dw1,
struct intel_bo *bo, uint32_t bo_offset,
uint64_t imm)
{
const uint8_t cmd_len = 5;
const uint32_t dw0 = GEN6_RENDER_CMD(3D, PIPE_CONTROL) |
(cmd_len - 2);
uint32_t reloc_flags = INTEL_RELOC_WRITE;
uint32_t *dw;
uint32_t pos;
CMD_ASSERT(cmd, 6, 7.5);
assert(bo_offset % 8 == 0);
if (dw1 & GEN6_PIPE_CONTROL_CS_STALL) {
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 73:
*
* "1 of the following must also be set (when CS stall is set):
*
* * Depth Cache Flush Enable ([0] of DW1)
* * Stall at Pixel Scoreboard ([1] of DW1)
* * Depth Stall ([13] of DW1)
* * Post-Sync Operation ([13] of DW1)
* * Render Target Cache Flush Enable ([12] of DW1)
* * Notify Enable ([8] of DW1)"
*
* From the Ivy Bridge PRM, volume 2 part 1, page 61:
*
* "One of the following must also be set (when CS stall is set):
*
* * Render Target Cache Flush Enable ([12] of DW1)
* * Depth Cache Flush Enable ([0] of DW1)
* * Stall at Pixel Scoreboard ([1] of DW1)
* * Depth Stall ([13] of DW1)
* * Post-Sync Operation ([13] of DW1)"
*/
uint32_t bit_test = GEN6_PIPE_CONTROL_RENDER_CACHE_FLUSH |
GEN6_PIPE_CONTROL_DEPTH_CACHE_FLUSH |
GEN6_PIPE_CONTROL_PIXEL_SCOREBOARD_STALL |
GEN6_PIPE_CONTROL_DEPTH_STALL;
/* post-sync op */
bit_test |= GEN6_PIPE_CONTROL_WRITE_IMM |
GEN6_PIPE_CONTROL_WRITE_PS_DEPTH_COUNT |
GEN6_PIPE_CONTROL_WRITE_TIMESTAMP;
if (cmd_gen(cmd) == INTEL_GEN(6))
bit_test |= GEN6_PIPE_CONTROL_NOTIFY_ENABLE;
assert(dw1 & bit_test);
}
if (dw1 & GEN6_PIPE_CONTROL_DEPTH_STALL) {
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 73:
*
* "Following bits must be clear (when Depth Stall is set):
*
* * Render Target Cache Flush Enable ([12] of DW1)
* * Depth Cache Flush Enable ([0] of DW1)"
*/
assert(!(dw1 & (GEN6_PIPE_CONTROL_RENDER_CACHE_FLUSH |
GEN6_PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
}
/*
* From the Sandy Bridge PRM, volume 1 part 3, page 19:
*
* "[DevSNB] PPGTT memory writes by MI_* (such as MI_STORE_DATA_IMM)
* and PIPE_CONTROL are not supported."
*
* The kernel will add the mapping automatically (when write domain is
* INTEL_DOMAIN_INSTRUCTION).
*/
if (cmd_gen(cmd) == INTEL_GEN(6) && bo) {
bo_offset |= GEN6_PIPE_CONTROL_DW2_USE_GGTT;
reloc_flags |= INTEL_RELOC_GGTT;
}
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = dw1;
dw[2] = 0;
dw[3] = (uint32_t) imm;
dw[4] = (uint32_t) (imm >> 32);
if (bo) {
cmd_reserve_reloc(cmd, 1);
cmd_batch_reloc(cmd, pos + 2, bo, bo_offset, reloc_flags);
}
}
static bool gen6_can_primitive_restart(const struct intel_cmd *cmd)
{
const struct intel_pipeline *p = cmd->bind.pipeline.graphics;
bool supported;
CMD_ASSERT(cmd, 6, 7.5);
if (cmd_gen(cmd) >= INTEL_GEN(7.5))
return (p->prim_type != GEN6_3DPRIM_RECTLIST);
switch (p->prim_type) {
case GEN6_3DPRIM_POINTLIST:
case GEN6_3DPRIM_LINELIST:
case GEN6_3DPRIM_LINESTRIP:
case GEN6_3DPRIM_TRILIST:
case GEN6_3DPRIM_TRISTRIP:
supported = true;
break;
default:
supported = false;
break;
}
if (!supported)
return false;
switch (cmd->bind.index.type) {
case VK_INDEX_TYPE_UINT16:
supported = (p->primitive_restart_index != 0xffffu);
break;
case VK_INDEX_TYPE_UINT32:
supported = (p->primitive_restart_index != 0xffffffffu);
break;
default:
supported = false;
break;
}
return supported;
}
static void gen6_3DSTATE_INDEX_BUFFER(struct intel_cmd *cmd,
const struct intel_buf *buf,
VkDeviceSize offset,
VkIndexType type,
bool enable_cut_index)
{
const uint8_t cmd_len = 3;
uint32_t dw0, end_offset, *dw;
unsigned offset_align;
uint32_t pos;
CMD_ASSERT(cmd, 6, 7.5);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2);
/* the bit is moved to 3DSTATE_VF */
if (cmd_gen(cmd) >= INTEL_GEN(7.5))
assert(!enable_cut_index);
if (enable_cut_index)
dw0 |= GEN6_IB_DW0_CUT_INDEX_ENABLE;
switch (type) {
case VK_INDEX_TYPE_UINT16:
dw0 |= GEN6_IB_DW0_FORMAT_WORD;
offset_align = 2;
break;
case VK_INDEX_TYPE_UINT32:
dw0 |= GEN6_IB_DW0_FORMAT_DWORD;
offset_align = 4;
break;
    default:
        assert(!"unsupported index type");
        offset_align = 2; /* fall back so end_offset below stays well-defined */
        break;
}
/* aligned and inclusive */
end_offset = buf->size - (buf->size % offset_align) - 1;
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
cmd_reserve_reloc(cmd, 2);
cmd_batch_reloc(cmd, pos + 1, buf->obj.mem->bo, offset, 0);
cmd_batch_reloc(cmd, pos + 2, buf->obj.mem->bo, end_offset, 0);
}
static void gen75_3DSTATE_VF(struct intel_cmd *cmd,
bool enable_cut_index,
uint32_t cut_index)
{
const uint8_t cmd_len = 2;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 7.5, 7.5);
dw0 = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2);
if (enable_cut_index)
dw0 |= GEN75_VF_DW0_CUT_INDEX_ENABLE;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = cut_index;
}
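/*
 * Patch a kernel's scratch-space pointer into the batch.  Per-thread
 * scratch size is encoded as a power-of-2 exponent relative to 1KB:
 * assuming u_ffs() behaves like ffs(3), u_ffs(size) - 11 equals
 * log2(size / 1024), e.g. 2048 bytes encodes as 1.  Since the scratch
 * offset is 1KB-aligned, its low bits are free to carry that exponent.
 */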
static void gen6_add_scratch_space(struct intel_cmd *cmd,
uint32_t batch_pos,
const struct intel_pipeline *pipeline,
const struct intel_pipeline_shader *sh)
{
int scratch_space;
CMD_ASSERT(cmd, 6, 7.5);
assert(sh->per_thread_scratch_size &&
sh->per_thread_scratch_size % 1024 == 0 &&
u_is_pow2(sh->per_thread_scratch_size) &&
sh->scratch_offset % 1024 == 0);
scratch_space = u_ffs(sh->per_thread_scratch_size) - 11;
cmd_reserve_reloc(cmd, 1);
cmd_batch_reloc(cmd, batch_pos, pipeline->obj.mem->bo,
sh->scratch_offset | scratch_space, INTEL_RELOC_WRITE);
}
static void gen6_3DSTATE_GS(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *gs = &pipeline->gs;
const uint8_t cmd_len = 7;
uint32_t dw0, dw2, dw4, dw5, dw6, *dw;
CMD_ASSERT(cmd, 6, 6);
int vue_read_len = 0;
int pos = 0;
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
if (pipeline->active_shaders & SHADER_GEOMETRY_FLAG) {
// based on ilo_gpe_init_gs_cso_gen6
vue_read_len = (gs->in_count + 1) / 2;
if (!vue_read_len)
vue_read_len = 1;
dw2 = (gs->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
gs->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT |
GEN6_THREADDISP_SPF;
dw4 = vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
0 << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
gs->urb_grf_start << GEN6_GS_DW4_URB_GRF_START__SHIFT;
dw5 = (gs->max_threads - 1) << GEN6_GS_DW5_MAX_THREADS__SHIFT |
GEN6_GS_DW5_STATISTICS |
GEN6_GS_DW5_RENDER_ENABLE;
dw6 = GEN6_GS_DW6_GS_ENABLE;
if (gs->discard_adj)
dw6 |= GEN6_GS_DW6_DISCARD_ADJACENCY;
} else {
dw2 = 0;
dw4 = 0;
dw5 = GEN6_GS_DW5_STATISTICS;
dw6 = 0;
}
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = cmd->bind.pipeline.gs_offset;
dw[2] = dw2;
dw[3] = 0;
dw[4] = dw4;
dw[5] = dw5;
dw[6] = dw6;
if (gs->per_thread_scratch_size)
gen6_add_scratch_space(cmd, pos + 3, pipeline, gs);
}
static void gen7_3DSTATE_GS(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *gs = &pipeline->gs;
const uint8_t cmd_len = 7;
uint32_t dw0, dw2, dw4, dw5, dw6, *dw;
CMD_ASSERT(cmd, 7, 7.5);
int vue_read_len = 0;
int pos = 0;
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
if (pipeline->active_shaders & SHADER_GEOMETRY_FLAG) {
// based on upload_gs_state
dw2 = (gs->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
gs->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
vue_read_len = (gs->in_count + 1) / 2;
if (!vue_read_len)
vue_read_len = 1;
dw4 = (gs->output_size_hwords * 2 - 1) << GEN7_GS_DW4_OUTPUT_SIZE__SHIFT |
gs->output_topology << GEN7_GS_DW4_OUTPUT_TOPO__SHIFT |
vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT |
0 << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT |
gs->urb_grf_start << GEN7_GS_DW4_URB_GRF_START__SHIFT;
dw5 = gs->control_data_header_size_hwords << GEN7_GS_DW5_CONTROL_DATA_HEADER_SIZE__SHIFT |
(gs->invocations - 1) << GEN7_GS_DW5_INSTANCE_CONTROL__SHIFT |
GEN7_GS_DW5_STATISTICS |
GEN7_GS_DW5_GS_ENABLE;
dw5 |= (gs->dual_instanced_dispatch) ? GEN7_GS_DW5_DISPATCH_MODE_DUAL_INSTANCE
: GEN7_GS_DW5_DISPATCH_MODE_DUAL_OBJECT;
if (gs->include_primitive_id)
dw5 |= GEN7_GS_DW5_INCLUDE_PRIMITIVE_ID;
if (cmd_gen(cmd) >= INTEL_GEN(7.5)) {
dw5 |= (gs->max_threads - 1) << GEN75_GS_DW5_MAX_THREADS__SHIFT;
dw5 |= GEN75_GS_DW5_REORDER_TRAILING;
dw6 = gs->control_data_format << GEN75_GS_DW6_GSCTRL__SHIFT;
} else {
dw5 |= (gs->max_threads - 1) << GEN7_GS_DW5_MAX_THREADS__SHIFT;
dw5 |= gs->control_data_format << GEN7_GS_DW5_GSCTRL__SHIFT;
dw6 = 0;
}
} else {
dw2 = 0;
dw4 = 0;
dw5 = GEN7_GS_DW5_STATISTICS;
dw6 = 0;
}
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = cmd->bind.pipeline.gs_offset;
dw[2] = dw2;
dw[3] = 0;
dw[4] = dw4;
dw[5] = dw5;
dw[6] = dw6;
if (gs->per_thread_scratch_size)
gen6_add_scratch_space(cmd, pos + 3, pipeline, gs);
}
static void gen6_3DSTATE_DRAWING_RECTANGLE(struct intel_cmd *cmd,
uint32_t width, uint32_t height)
{
const uint8_t cmd_len = 4;
const uint32_t dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_DRAWING_RECTANGLE) |
(cmd_len - 2);
uint32_t *dw;
CMD_ASSERT(cmd, 6, 7.5);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
if (width && height) {
dw[1] = 0;
dw[2] = (height - 1) << 16 |
(width - 1);
} else {
dw[1] = 1;
dw[2] = 0;
}
dw[3] = 0;
}
static void gen7_fill_3DSTATE_SF_body(const struct intel_cmd *cmd,
uint32_t body[6])
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_render_pass *rp = cmd->bind.render_pass;
const struct intel_render_pass_subpass *subpass =
cmd->bind.render_pass_subpass;
const struct intel_dynamic_line_width *line_width = &cmd->bind.state.line_width;
const struct intel_dynamic_depth_bias *depth_bias = &cmd->bind.state.depth_bias;
uint32_t dw1, dw2, dw3, dw4, dw5, dw6;
CMD_ASSERT(cmd, 6, 7.5);
dw1 = GEN7_SF_DW1_STATISTICS |
GEN7_SF_DW1_DEPTH_OFFSET_SOLID |
GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME |
GEN7_SF_DW1_DEPTH_OFFSET_POINT |
GEN7_SF_DW1_VIEWPORT_ENABLE |
pipeline->cmd_sf_fill;
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
int format = GEN6_ZFORMAT_D32_FLOAT;
if (subpass->ds_index < rp->attachment_count) {
switch (rp->attachments[subpass->ds_index].format) {
case VK_FORMAT_D16_UNORM:
format = GEN6_ZFORMAT_D16_UNORM;
break;
case VK_FORMAT_D32_SFLOAT:
case VK_FORMAT_D32_SFLOAT_S8_UINT:
format = GEN6_ZFORMAT_D32_FLOAT;
break;
default:
assert(!"unsupported depth/stencil format");
break;
}
}
dw1 |= format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT;
}
dw2 = pipeline->cmd_sf_cull;
/* Scissor is always enabled */
dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE;
// TODO: line width support
(void) line_width;
if (pipeline->sample_count > 1) {
dw2 |= 128 << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
GEN7_SF_DW2_MSRASTMODE_ON_PATTERN;
} else {
dw2 |= 0 << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
GEN7_SF_DW2_MSRASTMODE_OFF_PIXEL;
}
dw3 = 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT |
GEN7_SF_DW3_SUBPIXEL_8BITS;
if (pipeline->depthBiasEnable) {
dw4 = u_fui((float) depth_bias->depth_bias * 2.0f);
dw5 = u_fui(depth_bias->slope_scaled_depth_bias);
dw6 = u_fui(depth_bias->depth_bias_clamp);
} else {
dw4 = 0;
dw5 = 0;
dw6 = 0;
}
body[0] = dw1;
body[1] = dw2;
body[2] = dw3;
body[3] = dw4;
body[4] = dw5;
body[5] = dw6;
}
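/*
 * The six body dwords above are shared by both generations: Gen6 embeds
 * them in its 20-dword 3DSTATE_SF (together with the SBE fields), while
 * Gen7 emits them as the body of a standalone 7-dword 3DSTATE_SF.
 */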
static void gen6_3DSTATE_SF(struct intel_cmd *cmd)
{
const uint8_t cmd_len = 20;
const uint32_t dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_SF) |
(cmd_len - 2);
const uint32_t *sbe = cmd->bind.pipeline.graphics->cmd_3dstate_sbe;
uint32_t sf[6];
uint32_t *dw;
CMD_ASSERT(cmd, 6, 6);
gen7_fill_3DSTATE_SF_body(cmd, sf);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = sbe[1];
memcpy(&dw[2], sf, sizeof(sf));
    memcpy(&dw[8], &sbe[2], sizeof(uint32_t) * 12);
}
static void gen7_3DSTATE_SF(struct intel_cmd *cmd)
{
const uint8_t cmd_len = 7;
uint32_t *dw;
CMD_ASSERT(cmd, 7, 7.5);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) |
(cmd_len - 2);
gen7_fill_3DSTATE_SF_body(cmd, &dw[1]);
}
static void gen6_3DSTATE_CLIP(struct intel_cmd *cmd)
{
const uint8_t cmd_len = 4;
const uint32_t dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) |
(cmd_len - 2);
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *vs = &pipeline->vs;
const struct intel_pipeline_shader *fs = &pipeline->fs;
const struct intel_dynamic_viewport *viewport = &cmd->bind.state.viewport;
uint32_t dw1, dw2, dw3, *dw;
CMD_ASSERT(cmd, 6, 7.5);
dw1 = GEN6_CLIP_DW1_STATISTICS;
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
dw1 |= GEN7_CLIP_DW1_SUBPIXEL_8BITS |
GEN7_CLIP_DW1_EARLY_CULL_ENABLE |
pipeline->cmd_clip_cull;
}
dw2 = GEN6_CLIP_DW2_CLIP_ENABLE |
GEN6_CLIP_DW2_APIMODE_D3D | /* depth range [0, 1] */
GEN6_CLIP_DW2_XY_TEST_ENABLE |
(vs->enable_user_clip ? 1 : 0) << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT |
2 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
1 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
2 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
if (pipeline->rasterizerDiscardEnable)
dw2 |= GEN6_CLIP_DW2_CLIPMODE_REJECT_ALL;
else
dw2 |= GEN6_CLIP_DW2_CLIPMODE_NORMAL;
if (pipeline->depthClipEnable)
dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE;
if (fs->barycentric_interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL |
GEN6_INTERP_NONPERSPECTIVE_CENTROID |
GEN6_INTERP_NONPERSPECTIVE_SAMPLE))
dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE;
dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT |
0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT |
(viewport->viewport_count - 1);
/* TODO: framebuffer requests layer_count > 1 */
if (cmd->bind.fb->array_size == 1) {
dw3 |= GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO;
}
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = dw1;
dw[2] = dw2;
dw[3] = dw3;
}
static void gen6_3DSTATE_WM(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *fs = &pipeline->fs;
const uint8_t cmd_len = 9;
uint32_t pos;
uint32_t dw0, dw2, dw4, dw5, dw6, dw8, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
dw2 = (fs->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
fs->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw4 = GEN6_WM_DW4_STATISTICS |
fs->urb_grf_start << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
0 << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
fs->urb_grf_start_16 << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
dw5 = (fs->max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT |
GEN6_WM_DW5_PS_DISPATCH_ENABLE |
GEN6_PS_DISPATCH_8 << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
if (fs->offset_16)
dw5 |= GEN6_PS_DISPATCH_16 << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
if (fs->uses & INTEL_SHADER_USE_KILL ||
pipeline->alphaToCoverageEnable)
dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL;
if (fs->computed_depth_mode)
dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH;
if (fs->uses & INTEL_SHADER_USE_DEPTH)
dw5 |= GEN6_WM_DW5_PS_USE_DEPTH;
if (fs->uses & INTEL_SHADER_USE_W)
dw5 |= GEN6_WM_DW5_PS_USE_W;
if (pipeline->dual_source_blend_enable)
dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
dw6 = fs->in_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
GEN6_WM_DW6_PS_POSOFFSET_NONE |
GEN6_WM_DW6_ZW_INTERP_PIXEL |
fs->barycentric_interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT |
GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT;
if (pipeline->sample_count > 1) {
dw6 |= GEN6_WM_DW6_MSRASTMODE_ON_PATTERN |
GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
} else {
dw6 |= GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL |
GEN6_WM_DW6_MSDISPMODE_PERSAMPLE;
}
dw8 = (fs->offset_16) ? cmd->bind.pipeline.fs_offset + fs->offset_16 : 0;
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = cmd->bind.pipeline.fs_offset;
dw[2] = dw2;
dw[3] = 0; /* scratch */
dw[4] = dw4;
dw[5] = dw5;
dw[6] = dw6;
dw[7] = 0; /* kernel 1 */
dw[8] = dw8; /* kernel 2 */
if (fs->per_thread_scratch_size)
gen6_add_scratch_space(cmd, pos + 3, pipeline, fs);
}
static void gen7_3DSTATE_WM(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *fs = &pipeline->fs;
const uint8_t cmd_len = 3;
uint32_t dw0, dw1, dw2, *dw;
CMD_ASSERT(cmd, 7, 7.5);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
dw1 = GEN7_WM_DW1_STATISTICS |
GEN7_WM_DW1_PS_DISPATCH_ENABLE |
GEN7_WM_DW1_ZW_INTERP_PIXEL |
fs->barycentric_interps << GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT |
GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
if (fs->uses & INTEL_SHADER_USE_KILL ||
pipeline->alphaToCoverageEnable)
dw1 |= GEN7_WM_DW1_PS_KILL_PIXEL;
dw1 |= fs->computed_depth_mode << GEN7_WM_DW1_PSCDEPTH__SHIFT;
if (fs->uses & INTEL_SHADER_USE_DEPTH)
dw1 |= GEN7_WM_DW1_PS_USE_DEPTH;
if (fs->uses & INTEL_SHADER_USE_W)
dw1 |= GEN7_WM_DW1_PS_USE_W;
dw2 = 0;
if (pipeline->sample_count > 1) {
dw1 |= GEN7_WM_DW1_MSRASTMODE_ON_PATTERN;
dw2 |= GEN7_WM_DW2_MSDISPMODE_PERPIXEL;
} else {
dw1 |= GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL;
dw2 |= GEN7_WM_DW2_MSDISPMODE_PERSAMPLE;
}
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = dw1;
dw[2] = dw2;
}
static void gen7_3DSTATE_PS(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *fs = &pipeline->fs;
const uint8_t cmd_len = 8;
uint32_t dw0, dw2, dw4, dw5, dw7, *dw;
uint32_t pos;
CMD_ASSERT(cmd, 7, 7.5);
dw0 = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
dw2 = (fs->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
fs->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw4 = GEN7_PS_DW4_POSOFFSET_NONE |
GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
if (fs->offset_16)
dw4 |= GEN6_PS_DISPATCH_16 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
if (cmd_gen(cmd) >= INTEL_GEN(7.5)) {
dw4 |= (fs->max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
dw4 |= pipeline->cmd_sample_mask << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
} else {
dw4 |= (fs->max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
}
if (fs->in_count)
dw4 |= GEN7_PS_DW4_ATTR_ENABLE;
if (pipeline->dual_source_blend_enable)
dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
dw5 = fs->urb_grf_start << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
0 << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
fs->urb_grf_start_16 << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
dw7 = (fs->offset_16) ? cmd->bind.pipeline.fs_offset + fs->offset_16 : 0;
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = cmd->bind.pipeline.fs_offset;
dw[2] = dw2;
dw[3] = 0; /* scratch */
dw[4] = dw4;
dw[5] = dw5;
dw[6] = 0; /* kernel 1 */
dw[7] = dw7; /* kernel 2 */
if (fs->per_thread_scratch_size)
gen6_add_scratch_space(cmd, pos + 3, pipeline, fs);
}
static void gen6_3DSTATE_MULTISAMPLE(struct intel_cmd *cmd,
uint32_t sample_count)
{
const uint8_t cmd_len = (cmd_gen(cmd) >= INTEL_GEN(7)) ? 4 : 3;
uint32_t dw1, dw2, dw3, *dw;
CMD_ASSERT(cmd, 6, 7.5);
switch (sample_count) {
case 4:
dw1 = GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4;
dw2 = cmd->dev->sample_pattern_4x;
dw3 = 0;
break;
case 8:
assert(cmd_gen(cmd) >= INTEL_GEN(7));
dw1 = GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8;
dw2 = cmd->dev->sample_pattern_8x[0];
dw3 = cmd->dev->sample_pattern_8x[1];
break;
default:
assert(sample_count <= 1);
dw1 = GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
dw2 = 0;
dw3 = 0;
break;
}
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2);
dw[1] = dw1;
dw[2] = dw2;
if (cmd_gen(cmd) >= INTEL_GEN(7))
dw[3] = dw3;
}
static void gen6_3DSTATE_DEPTH_BUFFER(struct intel_cmd *cmd,
const struct intel_att_view *view,
bool optimal_ds)
{
const uint8_t cmd_len = 7;
uint32_t dw0, *dw;
uint32_t pos;
CMD_ASSERT(cmd, 6, 7.5);
dw0 = (cmd_gen(cmd) >= INTEL_GEN(7)) ?
GEN7_RENDER_CMD(3D, 3DSTATE_DEPTH_BUFFER) :
GEN6_RENDER_CMD(3D, 3DSTATE_DEPTH_BUFFER);
dw0 |= (cmd_len - 2);
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = view->att_cmd[0];
/* note that we only enable HiZ on Gen7+ */
if (!optimal_ds)
dw[1] &= ~GEN7_DEPTH_DW1_HIZ_ENABLE;
dw[2] = 0;
dw[3] = view->att_cmd[2];
dw[4] = view->att_cmd[3];
dw[5] = view->att_cmd[4];
dw[6] = view->att_cmd[5];
if (view->img) {
cmd_reserve_reloc(cmd, 1);
cmd_batch_reloc(cmd, pos + 2, view->img->obj.mem->bo,
view->att_cmd[1], INTEL_RELOC_WRITE);
}
}
static void gen6_3DSTATE_STENCIL_BUFFER(struct intel_cmd *cmd,
const struct intel_att_view *view,
bool optimal_ds)
{
const uint8_t cmd_len = 3;
uint32_t dw0, *dw;
uint32_t pos;
CMD_ASSERT(cmd, 6, 7.5);
dw0 = (cmd_gen(cmd) >= INTEL_GEN(7)) ?
GEN7_RENDER_CMD(3D, 3DSTATE_STENCIL_BUFFER) :
GEN6_RENDER_CMD(3D, 3DSTATE_STENCIL_BUFFER);
dw0 |= (cmd_len - 2);
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
if (view->has_stencil) {
dw[1] = view->att_cmd[6];
cmd_reserve_reloc(cmd, 1);
cmd_batch_reloc(cmd, pos + 2, view->img->obj.mem->bo,
view->att_cmd[7], INTEL_RELOC_WRITE);
} else {
dw[1] = 0;
dw[2] = 0;
}
}
static void gen6_3DSTATE_HIER_DEPTH_BUFFER(struct intel_cmd *cmd,
const struct intel_att_view *view,
bool optimal_ds)
{
const uint8_t cmd_len = 3;
uint32_t dw0, *dw;
uint32_t pos;
CMD_ASSERT(cmd, 6, 7.5);
dw0 = (cmd_gen(cmd) >= INTEL_GEN(7)) ?
GEN7_RENDER_CMD(3D, 3DSTATE_HIER_DEPTH_BUFFER) :
GEN6_RENDER_CMD(3D, 3DSTATE_HIER_DEPTH_BUFFER);
dw0 |= (cmd_len - 2);
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
if (view->has_hiz && optimal_ds) {
dw[1] = view->att_cmd[8];
cmd_reserve_reloc(cmd, 1);
cmd_batch_reloc(cmd, pos + 2, view->img->obj.mem->bo,
view->att_cmd[9], INTEL_RELOC_WRITE);
} else {
dw[1] = 0;
dw[2] = 0;
}
}
static void gen6_3DSTATE_CLEAR_PARAMS(struct intel_cmd *cmd,
uint32_t clear_val)
{
const uint8_t cmd_len = 2;
const uint32_t dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_CLEAR_PARAMS) |
GEN6_CLEAR_PARAMS_DW0_VALID |
(cmd_len - 2);
uint32_t *dw;
CMD_ASSERT(cmd, 6, 6);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = clear_val;
}
static void gen7_3DSTATE_CLEAR_PARAMS(struct intel_cmd *cmd,
uint32_t clear_val)
{
const uint8_t cmd_len = 3;
const uint32_t dw0 = GEN7_RENDER_CMD(3D, 3DSTATE_CLEAR_PARAMS) |
(cmd_len - 2);
uint32_t *dw;
CMD_ASSERT(cmd, 7, 7.5);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = clear_val;
dw[2] = 1;
}
static void gen6_3DSTATE_CC_STATE_POINTERS(struct intel_cmd *cmd,
uint32_t blend_offset,
uint32_t ds_offset,
uint32_t cc_offset)
{
const uint8_t cmd_len = 4;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_CC_STATE_POINTERS) |
(cmd_len - 2);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = blend_offset | 1;
dw[2] = ds_offset | 1;
dw[3] = cc_offset | 1;
}
static void gen6_3DSTATE_VIEWPORT_STATE_POINTERS(struct intel_cmd *cmd,
uint32_t clip_offset,
uint32_t sf_offset,
uint32_t cc_offset)
{
const uint8_t cmd_len = 4;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_VIEWPORT_STATE_POINTERS) |
GEN6_VP_PTR_DW0_CLIP_CHANGED |
GEN6_VP_PTR_DW0_SF_CHANGED |
GEN6_VP_PTR_DW0_CC_CHANGED |
(cmd_len - 2);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = clip_offset;
dw[2] = sf_offset;
dw[3] = cc_offset;
}
static void gen6_3DSTATE_SCISSOR_STATE_POINTERS(struct intel_cmd *cmd,
uint32_t scissor_offset)
{
const uint8_t cmd_len = 2;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_SCISSOR_STATE_POINTERS) |
(cmd_len - 2);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = scissor_offset;
}
static void gen6_3DSTATE_BINDING_TABLE_POINTERS(struct intel_cmd *cmd,
uint32_t vs_offset,
uint32_t gs_offset,
uint32_t ps_offset)
{
const uint8_t cmd_len = 4;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_BINDING_TABLE_POINTERS) |
GEN6_BINDING_TABLE_PTR_DW0_VS_CHANGED |
GEN6_BINDING_TABLE_PTR_DW0_GS_CHANGED |
GEN6_BINDING_TABLE_PTR_DW0_PS_CHANGED |
(cmd_len - 2);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = vs_offset;
dw[2] = gs_offset;
dw[3] = ps_offset;
}
static void gen6_3DSTATE_SAMPLER_STATE_POINTERS(struct intel_cmd *cmd,
uint32_t vs_offset,
uint32_t gs_offset,
uint32_t ps_offset)
{
const uint8_t cmd_len = 4;
uint32_t dw0, *dw;
CMD_ASSERT(cmd, 6, 6);
dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLER_STATE_POINTERS) |
GEN6_SAMPLER_PTR_DW0_VS_CHANGED |
GEN6_SAMPLER_PTR_DW0_GS_CHANGED |
GEN6_SAMPLER_PTR_DW0_PS_CHANGED |
(cmd_len - 2);
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = vs_offset;
dw[2] = gs_offset;
dw[3] = ps_offset;
}
static void gen7_3dstate_pointer(struct intel_cmd *cmd,
int subop, uint32_t offset)
{
const uint8_t cmd_len = 2;
const uint32_t dw0 = GEN6_RENDER_TYPE_RENDER |
GEN6_RENDER_SUBTYPE_3D |
subop | (cmd_len - 2);
uint32_t *dw;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = offset;
}
static uint32_t gen6_BLEND_STATE(struct intel_cmd *cmd)
{
const uint8_t cmd_align = GEN6_ALIGNMENT_BLEND_STATE;
const uint8_t cmd_len = INTEL_MAX_RENDER_TARGETS * 2;
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
CMD_ASSERT(cmd, 6, 7.5);
STATIC_ASSERT(ARRAY_SIZE(pipeline->cmd_cb) >= INTEL_MAX_RENDER_TARGETS);
return cmd_state_write(cmd, INTEL_CMD_ITEM_BLEND, cmd_align, cmd_len, pipeline->cmd_cb);
}
static uint32_t gen6_DEPTH_STENCIL_STATE(struct intel_cmd *cmd,
const struct intel_dynamic_stencil *stencil_state)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const uint8_t cmd_align = GEN6_ALIGNMENT_DEPTH_STENCIL_STATE;
const uint8_t cmd_len = 3;
uint32_t dw[3];
dw[0] = pipeline->cmd_depth_stencil;
/* TODO: enable back facing stencil state */
/* same read and write masks for both front and back faces */
dw[1] = (stencil_state->front.stencil_compare_mask & 0xff) << 24 |
(stencil_state->front.stencil_write_mask & 0xff) << 16 |
(stencil_state->front.stencil_compare_mask & 0xff) << 8 |
(stencil_state->front.stencil_write_mask & 0xff);
dw[2] = pipeline->cmd_depth_test;
CMD_ASSERT(cmd, 6, 7.5);
    if (stencil_state->front.stencil_write_mask && pipeline->stencilTestEnable)
        dw[0] |= 1 << 18; /* stencil buffer write enable */
return cmd_state_write(cmd, INTEL_CMD_ITEM_DEPTH_STENCIL,
cmd_align, cmd_len, dw);
}
static uint32_t gen6_COLOR_CALC_STATE(struct intel_cmd *cmd,
uint32_t stencil_ref,
const uint32_t blend_color[4])
{
const uint8_t cmd_align = GEN6_ALIGNMENT_COLOR_CALC_STATE;
const uint8_t cmd_len = 6;
uint32_t offset, *dw;
CMD_ASSERT(cmd, 6, 7.5);
offset = cmd_state_pointer(cmd, INTEL_CMD_ITEM_COLOR_CALC,
cmd_align, cmd_len, &dw);
dw[0] = stencil_ref;
dw[1] = 0;
dw[2] = blend_color[0];
dw[3] = blend_color[1];
dw[4] = blend_color[2];
dw[5] = blend_color[3];
return offset;
}
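/*
 * The cmd_wa_*() helpers below emit the PIPE_CONTROL sequences that the
 * PRMs require around certain state changes.  Most of them return early
 * while draw_count is zero, since a fresh batch has no outstanding work
 * to stall against.
 */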
static void cmd_wa_gen6_pre_depth_stall_write(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 6, 7.5);
if (!cmd->bind.draw_count)
return;
if (cmd->bind.wa_flags & INTEL_CMD_WA_GEN6_PRE_DEPTH_STALL_WRITE)
return;
cmd->bind.wa_flags |= INTEL_CMD_WA_GEN6_PRE_DEPTH_STALL_WRITE;
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 60:
*
* "Pipe-control with CS-stall bit set must be sent BEFORE the
* pipe-control with a post-sync op and no write-cache flushes."
*
* The workaround below necessitates this workaround.
*/
gen6_PIPE_CONTROL(cmd,
GEN6_PIPE_CONTROL_CS_STALL |
GEN6_PIPE_CONTROL_PIXEL_SCOREBOARD_STALL,
NULL, 0, 0);
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_WRITE_IMM,
cmd->scratch_bo, 0, 0);
}
static void cmd_wa_gen6_pre_command_scoreboard_stall(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 6, 7.5);
if (!cmd->bind.draw_count)
return;
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_PIXEL_SCOREBOARD_STALL,
NULL, 0, 0);
}
static void cmd_wa_gen7_pre_vs_depth_stall_write(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 7, 7.5);
if (!cmd->bind.draw_count)
return;
cmd_wa_gen6_pre_depth_stall_write(cmd);
gen6_PIPE_CONTROL(cmd,
GEN6_PIPE_CONTROL_DEPTH_STALL | GEN6_PIPE_CONTROL_WRITE_IMM,
cmd->scratch_bo, 0, 0);
}
static void cmd_wa_gen7_post_command_cs_stall(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 7, 7.5);
/*
* From the Ivy Bridge PRM, volume 2 part 1, page 61:
*
* "One of the following must also be set (when CS stall is set):
*
* * Render Target Cache Flush Enable ([12] of DW1)
* * Depth Cache Flush Enable ([0] of DW1)
* * Stall at Pixel Scoreboard ([1] of DW1)
* * Depth Stall ([13] of DW1)
* * Post-Sync Operation ([13] of DW1)"
*/
gen6_PIPE_CONTROL(cmd,
GEN6_PIPE_CONTROL_CS_STALL |
GEN6_PIPE_CONTROL_PIXEL_SCOREBOARD_STALL,
NULL, 0, 0);
}
static void cmd_wa_gen7_post_command_depth_stall(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 7, 7.5);
cmd_wa_gen6_pre_depth_stall_write(cmd);
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_DEPTH_STALL, NULL, 0, 0);
}
static void cmd_wa_gen6_pre_multisample_depth_flush(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 6, 7.5);
if (!cmd->bind.draw_count)
return;
    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 305:
     *
     *     "Driver must guarantee that all the caches in the depth pipe are
     *      flushed before this command (3DSTATE_MULTISAMPLE) is parsed. This
     *      requires driver to send a PIPE_CONTROL with a CS stall along with
     *      a Depth Flush prior to this command."
     *
     * From the Ivy Bridge PRM, volume 2 part 1, page 304:
     *
     *     "Driver must guarantee that all the caches in the depth pipe are
     *      flushed before this command (3DSTATE_MULTISAMPLE) is parsed. This
     *      requires driver to send a PIPE_CONTROL with a CS stall along with
     *      a Depth Flush prior to this command."
     */
gen6_PIPE_CONTROL(cmd,
GEN6_PIPE_CONTROL_DEPTH_CACHE_FLUSH |
GEN6_PIPE_CONTROL_CS_STALL,
NULL, 0, 0);
}
static void cmd_wa_gen6_pre_ds_flush(struct intel_cmd *cmd)
{
CMD_ASSERT(cmd, 6, 7.5);
if (!cmd->bind.draw_count)
return;
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 315:
     *
     *     "Driver must send at least one PIPE_CONTROL command with CS Stall
     *      and a post sync operation prior to the group of depth
     *      commands (3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
     *      3DSTATE_STENCIL_BUFFER, and 3DSTATE_HIER_DEPTH_BUFFER)."
     *
     * This workaround satisfies all the conditions.
     */
cmd_wa_gen6_pre_depth_stall_write(cmd);
/*
* From the Ivy Bridge PRM, volume 2 part 1, page 315:
*
* "Restriction: Prior to changing Depth/Stencil Buffer state (i.e.,
* any combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
* 3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
* issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
* set), followed by a pipelined depth cache flush (PIPE_CONTROL with
* Depth Flush Bit set, followed by another pipelined depth stall
* (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
* guarantee that the pipeline from WM onwards is already flushed
* (e.g., via a preceding MI_FLUSH)."
*/
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_DEPTH_STALL, NULL, 0, 0);
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_DEPTH_CACHE_FLUSH, NULL, 0, 0);
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_DEPTH_STALL, NULL, 0, 0);
}
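/*
 * Program STATE_BASE_ADDRESS.  Bit 0 of each address/bound dword is its
 * "modify enable" bit, which is why every value below is OR'ed with 1;
 * the surface-state, dynamic-state, and instruction bases are then
 * relocated to the corresponding writers' buffers.
 */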
void cmd_batch_state_base_address(struct intel_cmd *cmd)
{
const uint8_t cmd_len = 10;
const uint32_t dw0 = GEN6_RENDER_CMD(COMMON, STATE_BASE_ADDRESS) |
(cmd_len - 2);
const uint32_t mocs = (cmd_gen(cmd) >= INTEL_GEN(7)) ?
(GEN7_MOCS_L3_WB << 8 | GEN7_MOCS_L3_WB << 4) : 0;
uint32_t pos;
uint32_t *dw;
CMD_ASSERT(cmd, 6, 7.5);
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
/* start offsets */
dw[1] = mocs | 1;
dw[2] = 1;
dw[3] = 1;
dw[4] = 1;
dw[5] = 1;
/* end offsets */
dw[6] = 1;
dw[7] = 1 + 0xfffff000;
dw[8] = 1 + 0xfffff000;
dw[9] = 1;
cmd_reserve_reloc(cmd, 3);
cmd_batch_reloc_writer(cmd, pos + 2, INTEL_CMD_WRITER_SURFACE,
cmd->writers[INTEL_CMD_WRITER_SURFACE].sba_offset + 1);
cmd_batch_reloc_writer(cmd, pos + 3, INTEL_CMD_WRITER_STATE,
cmd->writers[INTEL_CMD_WRITER_STATE].sba_offset + 1);
cmd_batch_reloc_writer(cmd, pos + 5, INTEL_CMD_WRITER_INSTRUCTION,
cmd->writers[INTEL_CMD_WRITER_INSTRUCTION].sba_offset + 1);
}
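/*
 * Partition the push-constant space among the shader stages: VS and PS
 * each get a non-zero allocation (twice as large on GT3), while HS, DS,
 * and GS are programmed with zero-sized ranges since no push constants
 * are handed to those stages here.
 */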
void cmd_batch_push_const_alloc(struct intel_cmd *cmd)
{
const uint32_t size = (cmd->dev->gpu->gt == 3) ? 16 : 8;
const uint8_t cmd_len = 2;
uint32_t offset = 0;
uint32_t *dw;
if (cmd_gen(cmd) <= INTEL_GEN(6))
return;
CMD_ASSERT(cmd, 7, 7.5);
/* 3DSTATE_PUSH_CONSTANT_ALLOC_x */
cmd_batch_pointer(cmd, cmd_len * 5, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_VS) | (cmd_len - 2);
dw[1] = offset << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
size << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
offset += size;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_PS) | (cmd_len - 2);
dw[1] = offset << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
size << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_HS) | (cmd_len - 2);
dw[1] = 0 << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
0 << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_DS) | (cmd_len - 2);
dw[1] = 0 << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
0 << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_GS) | (cmd_len - 2);
dw[1] = 0 << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
0 << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
    /*
     * From the Ivy Bridge PRM, volume 2 part 1, page 292:
     *
     *     "A PIPE_CONTROL command with the CS Stall bit set must be
     *      programmed in the ring after this instruction
     *      (3DSTATE_PUSH_CONSTANT_ALLOC_PS)."
     */
cmd_wa_gen7_post_command_cs_stall(cmd);
}
void cmd_batch_flush(struct intel_cmd *cmd, uint32_t pipe_control_dw0)
{
if (pipe_control_dw0 == 0)
return;
if (!cmd->bind.draw_count)
return;
assert(!(pipe_control_dw0 & GEN6_PIPE_CONTROL_WRITE__MASK));
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 60:
*
* "Before a PIPE_CONTROL with Write Cache Flush Enable =1, a
* PIPE_CONTROL with any non-zero post-sync-op is required."
*/
if (pipe_control_dw0 & GEN6_PIPE_CONTROL_RENDER_CACHE_FLUSH)
cmd_wa_gen6_pre_depth_stall_write(cmd);
/*
* From the Ivy Bridge PRM, volume 2 part 1, page 61:
*
* "One of the following must also be set (when CS stall is set):
*
* * Render Target Cache Flush Enable ([12] of DW1)
* * Depth Cache Flush Enable ([0] of DW1)
* * Stall at Pixel Scoreboard ([1] of DW1)
* * Depth Stall ([13] of DW1)
* * Post-Sync Operation ([13] of DW1)"
*/
if ((pipe_control_dw0 & GEN6_PIPE_CONTROL_CS_STALL) &&
!(pipe_control_dw0 & (GEN6_PIPE_CONTROL_RENDER_CACHE_FLUSH |
GEN6_PIPE_CONTROL_DEPTH_CACHE_FLUSH |
GEN6_PIPE_CONTROL_PIXEL_SCOREBOARD_STALL |
GEN6_PIPE_CONTROL_DEPTH_STALL)))
pipe_control_dw0 |= GEN6_PIPE_CONTROL_PIXEL_SCOREBOARD_STALL;
gen6_PIPE_CONTROL(cmd, pipe_control_dw0, NULL, 0, 0);
}
void cmd_batch_flush_all(struct intel_cmd *cmd)
{
cmd_batch_flush(cmd, GEN6_PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
GEN6_PIPE_CONTROL_RENDER_CACHE_FLUSH |
GEN6_PIPE_CONTROL_DEPTH_CACHE_FLUSH |
GEN6_PIPE_CONTROL_VF_CACHE_INVALIDATE |
GEN6_PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
GEN6_PIPE_CONTROL_CS_STALL);
}
void cmd_batch_depth_count(struct intel_cmd *cmd,
struct intel_bo *bo,
VkDeviceSize offset)
{
cmd_wa_gen6_pre_depth_stall_write(cmd);
gen6_PIPE_CONTROL(cmd,
GEN6_PIPE_CONTROL_DEPTH_STALL |
GEN6_PIPE_CONTROL_WRITE_PS_DEPTH_COUNT,
bo, offset, 0);
}
void cmd_batch_timestamp(struct intel_cmd *cmd,
struct intel_bo *bo,
VkDeviceSize offset)
{
/* need any WA or stall? */
gen6_PIPE_CONTROL(cmd, GEN6_PIPE_CONTROL_WRITE_TIMESTAMP, bo, offset, 0);
}
void cmd_batch_immediate(struct intel_cmd *cmd,
uint32_t pipe_control_flags,
struct intel_bo *bo,
VkDeviceSize offset,
uint64_t val)
{
/* need any WA or stall? */
gen6_PIPE_CONTROL(cmd,
GEN6_PIPE_CONTROL_WRITE_IMM | pipe_control_flags,
bo, offset, val);
}
static void gen6_cc_states(struct intel_cmd *cmd)
{
const struct intel_dynamic_blend *blend = &cmd->bind.state.blend;
const struct intel_dynamic_stencil *ss = &cmd->bind.state.stencil;
uint32_t blend_offset, ds_offset, cc_offset;
uint32_t stencil_ref;
uint32_t blend_color[4];
CMD_ASSERT(cmd, 6, 6);
blend_offset = gen6_BLEND_STATE(cmd);
if (blend)
memcpy(blend_color, blend->blend_const, sizeof(blend_color));
else
memset(blend_color, 0, sizeof(blend_color));
if (ss) {
ds_offset = gen6_DEPTH_STENCIL_STATE(cmd, ss);
/* TODO: enable back facing stencil state */
/* same reference for both front and back faces */
stencil_ref = (ss->front.stencil_reference & 0xff) << 24 |
(ss->front.stencil_reference & 0xff) << 16;
} else {
ds_offset = 0;
stencil_ref = 0;
}
cc_offset = gen6_COLOR_CALC_STATE(cmd, stencil_ref, blend_color);
gen6_3DSTATE_CC_STATE_POINTERS(cmd, blend_offset, ds_offset, cc_offset);
}
static void gen6_viewport_states(struct intel_cmd *cmd)
{
const struct intel_dynamic_viewport *viewport = &cmd->bind.state.viewport;
uint32_t sf_offset, clip_offset, cc_offset, scissor_offset;
if (!viewport)
return;
    assert(viewport->cmd_len ==
           (8 /* SF_VIEWPORT */ + 4 /* CLIP_VIEWPORT */ + 2 /* CC_VIEWPORT */ +
            2 /* SCISSOR_RECT */) * viewport->viewport_count);
sf_offset = cmd_state_write(cmd, INTEL_CMD_ITEM_SF_VIEWPORT,
GEN6_ALIGNMENT_SF_VIEWPORT, 8 * viewport->viewport_count,
viewport->cmd);
clip_offset = cmd_state_write(cmd, INTEL_CMD_ITEM_CLIP_VIEWPORT,
GEN6_ALIGNMENT_CLIP_VIEWPORT, 4 * viewport->viewport_count,
&viewport->cmd[viewport->cmd_clip_pos]);
    cc_offset = cmd_state_write(cmd, INTEL_CMD_ITEM_CC_VIEWPORT,
            GEN6_ALIGNMENT_CC_VIEWPORT, 2 * viewport->viewport_count,
            &viewport->cmd[viewport->cmd_cc_pos]);
scissor_offset = cmd_state_write(cmd, INTEL_CMD_ITEM_SCISSOR_RECT,
GEN6_ALIGNMENT_SCISSOR_RECT, 2 * viewport->viewport_count,
&viewport->cmd[viewport->cmd_scissor_rect_pos]);
gen6_3DSTATE_VIEWPORT_STATE_POINTERS(cmd,
clip_offset, sf_offset, cc_offset);
gen6_3DSTATE_SCISSOR_STATE_POINTERS(cmd, scissor_offset);
}
static void gen7_cc_states(struct intel_cmd *cmd)
{
const struct intel_dynamic_blend *blend = &cmd->bind.state.blend;
const struct intel_dynamic_depth_bounds *ds = &cmd->bind.state.depth_bounds;
const struct intel_dynamic_stencil *ss = &cmd->bind.state.stencil;
uint32_t stencil_ref;
uint32_t blend_color[4];
uint32_t offset;
CMD_ASSERT(cmd, 7, 7.5);
if (!blend && !ds)
return;
offset = gen6_BLEND_STATE(cmd);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BLEND_STATE_POINTERS, offset);
if (blend)
memcpy(blend_color, blend->blend_const, sizeof(blend_color));
else
memset(blend_color, 0, sizeof(blend_color));
if (ss) {
offset = gen6_DEPTH_STENCIL_STATE(cmd, ss);
/* TODO: enable back facing stencil state */
/* same reference for both front and back faces */
stencil_ref = (ss->front.stencil_reference & 0xff) << 24 |
(ss->front.stencil_reference & 0xff) << 16;
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_DEPTH_STENCIL_STATE_POINTERS,
offset);
} else {
stencil_ref = 0;
}
offset = gen6_COLOR_CALC_STATE(cmd, stencil_ref, blend_color);
gen7_3dstate_pointer(cmd,
GEN6_RENDER_OPCODE_3DSTATE_CC_STATE_POINTERS, offset);
}
static void gen7_viewport_states(struct intel_cmd *cmd)
{
const struct intel_dynamic_viewport *viewport = &cmd->bind.state.viewport;
uint32_t offset;
if (!viewport)
return;
assert(viewport->cmd_len == (16 + 2 + 2) * viewport->viewport_count);
offset = cmd_state_write(cmd, INTEL_CMD_ITEM_SF_VIEWPORT,
GEN7_ALIGNMENT_SF_CLIP_VIEWPORT, 16 * viewport->viewport_count,
viewport->cmd);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP,
offset);
offset = cmd_state_write(cmd, INTEL_CMD_ITEM_CC_VIEWPORT,
GEN6_ALIGNMENT_CC_VIEWPORT, 2 * viewport->viewport_count,
&viewport->cmd[viewport->cmd_cc_pos]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_VIEWPORT_STATE_POINTERS_CC,
offset);
offset = cmd_state_write(cmd, INTEL_CMD_ITEM_SCISSOR_RECT,
GEN6_ALIGNMENT_SCISSOR_RECT, 2 * viewport->viewport_count,
&viewport->cmd[viewport->cmd_scissor_rect_pos]);
gen7_3dstate_pointer(cmd,
GEN6_RENDER_OPCODE_3DSTATE_SCISSOR_STATE_POINTERS,
offset);
}
static void gen6_pcb(struct intel_cmd *cmd, int subop,
const struct intel_pipeline_shader *sh)
{
const uint8_t cmd_len = 5;
uint32_t *dw;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_RENDER_TYPE_RENDER |
GEN6_RENDER_SUBTYPE_3D |
subop | (cmd_len - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
dw[4] = 0;
}
static void gen7_pcb(struct intel_cmd *cmd, int subop,
const struct intel_pipeline_shader *sh)
{
const uint8_t cmd_len = 7;
uint32_t *dw;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_RENDER_TYPE_RENDER |
GEN6_RENDER_SUBTYPE_3D |
subop | (cmd_len - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
dw[4] = 0;
dw[5] = 0;
dw[6] = 0;
}
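/*
 * Write SAMPLER_STATE and SAMPLER_BORDER_COLOR_STATE for every sampler
 * slot in the rmap, returning the offset of the sampler-state array.
 * Sampler slots follow the surface slots in rmap->slots[], hence the
 * surface_count offset into the array below.
 */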
static uint32_t emit_samplers(struct intel_cmd *cmd,
const struct intel_pipeline_rmap *rmap)
{
const struct intel_desc_region *region = cmd->dev->desc_region;
const struct intel_cmd_dset_data *data = &cmd->bind.dset.graphics_data;
const uint32_t border_len = (cmd_gen(cmd) >= INTEL_GEN(7)) ? 4 : 12;
const uint32_t border_stride =
u_align(border_len, GEN6_ALIGNMENT_SAMPLER_BORDER_COLOR_STATE / 4);
uint32_t border_offset, *border_dw, sampler_offset, *sampler_dw;
uint32_t surface_count;
uint32_t i;
CMD_ASSERT(cmd, 6, 7.5);
if (!rmap || !rmap->sampler_count)
return 0;
surface_count = rmap->rt_count + rmap->texture_resource_count + rmap->resource_count + rmap->uav_count;
/*
* note that we cannot call cmd_state_pointer() here as the following
* cmd_state_pointer() would invalidate the pointer
*/
border_offset = cmd_state_reserve(cmd, INTEL_CMD_ITEM_BLOB,
GEN6_ALIGNMENT_SAMPLER_BORDER_COLOR_STATE,
border_stride * rmap->sampler_count);
sampler_offset = cmd_state_pointer(cmd, INTEL_CMD_ITEM_SAMPLER,
GEN6_ALIGNMENT_SAMPLER_STATE,
4 * rmap->sampler_count, &sampler_dw);
cmd_state_update(cmd, border_offset,
border_stride * rmap->sampler_count, &border_dw);
for (i = 0; i < rmap->sampler_count; i++) {
const struct intel_pipeline_rmap_slot *slot =
&rmap->slots[surface_count + i];
struct intel_desc_offset desc_offset;
const struct intel_sampler *sampler;
switch (slot->type) {
case INTEL_PIPELINE_RMAP_SAMPLER:
intel_desc_offset_add(&desc_offset, &slot->u.sampler,
&data->set_offsets[slot->index]);
intel_desc_region_read_sampler(region, &desc_offset, &sampler);
break;
case INTEL_PIPELINE_RMAP_UNUSED:
sampler = NULL;
break;
default:
assert(!"unexpected rmap type");
sampler = NULL;
break;
}
if (sampler) {
memcpy(border_dw, &sampler->cmd[3], border_len * 4);
sampler_dw[0] = sampler->cmd[0];
sampler_dw[1] = sampler->cmd[1];
sampler_dw[2] = border_offset;
sampler_dw[3] = sampler->cmd[2];
} else {
sampler_dw[0] = GEN6_SAMPLER_DW0_DISABLE;
sampler_dw[1] = 0;
sampler_dw[2] = 0;
sampler_dw[3] = 0;
}
border_offset += border_stride * 4;
border_dw += border_stride;
sampler_dw += 4;
}
return sampler_offset;
}
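/*
 * Build the binding table for one shader stage: one SURFACE_STATE per
 * rmap slot (a render target, a buffer/image surface, or a null view
 * for unused slots), followed by the table of their offsets.  All
 * offsets are relative to the surface writer's state base address.
 */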
static uint32_t emit_binding_table(struct intel_cmd *cmd,
const struct intel_pipeline_rmap *rmap,
const VkShaderStageFlagBits stage)
{
const struct intel_desc_region *region = cmd->dev->desc_region;
const struct intel_cmd_dset_data *data = &cmd->bind.dset.graphics_data;
const uint32_t sba_offset =
cmd->writers[INTEL_CMD_WRITER_SURFACE].sba_offset;
uint32_t binding_table[256], offset;
uint32_t surface_count, i;
CMD_ASSERT(cmd, 6, 7.5);
surface_count = (rmap) ?
rmap->rt_count + rmap->texture_resource_count + rmap->resource_count + rmap->uav_count : 0;
if (!surface_count)
return 0;
assert(surface_count <= ARRAY_SIZE(binding_table));
for (i = 0; i < surface_count; i++) {
const struct intel_pipeline_rmap_slot *slot = &rmap->slots[i];
struct intel_null_view null_view;
bool need_null_view = false;
switch (slot->type) {
case INTEL_PIPELINE_RMAP_RT:
{
const struct intel_render_pass_subpass *subpass =
cmd->bind.render_pass_subpass;
const struct intel_fb *fb = cmd->bind.fb;
const struct intel_att_view *view =
(slot->index < subpass->color_count &&
subpass->color_indices[slot->index] < fb->view_count) ?
fb->views[subpass->color_indices[slot->index]] : NULL;
if (view) {
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_SURFACE,
GEN6_ALIGNMENT_SURFACE_STATE,
view->cmd_len, view->att_cmd);
cmd_reserve_reloc(cmd, 1);
cmd_surface_reloc(cmd, offset, 1, view->img->obj.mem->bo,
view->att_cmd[1], INTEL_RELOC_WRITE);
} else {
need_null_view = true;
}
}
break;
case INTEL_PIPELINE_RMAP_SURFACE:
{
const struct intel_pipeline_layout U_ASSERT_ONLY *pipeline_layout =
cmd->bind.pipeline.graphics->pipeline_layout;
const int32_t dyn_idx = slot->u.surface.dynamic_offset_index;
struct intel_desc_offset desc_offset;
const struct intel_mem *mem;
bool read_only;
const uint32_t *cmd_data;
uint32_t cmd_len;
assert(dyn_idx < 0 ||
dyn_idx < pipeline_layout->total_dynamic_desc_count);
intel_desc_offset_add(&desc_offset, &slot->u.surface.offset,
&data->set_offsets[slot->index]);
intel_desc_region_read_surface(region, &desc_offset, stage,
&mem, &read_only, &cmd_data, &cmd_len);
if (mem) {
const uint32_t dynamic_offset = (dyn_idx >= 0) ?
data->dynamic_offsets[dyn_idx] : 0;
const uint32_t reloc_flags =
(read_only) ? 0 : INTEL_RELOC_WRITE;
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_SURFACE,
GEN6_ALIGNMENT_SURFACE_STATE,
cmd_len, cmd_data);
cmd_reserve_reloc(cmd, 1);
cmd_surface_reloc(cmd, offset, 1, mem->bo,
cmd_data[1] + dynamic_offset, reloc_flags);
} else {
need_null_view = true;
}
}
break;
case INTEL_PIPELINE_RMAP_UNUSED:
need_null_view = true;
break;
default:
assert(!"unexpected rmap type");
need_null_view = true;
break;
}
if (need_null_view) {
intel_null_view_init(&null_view, cmd->dev);
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_SURFACE,
GEN6_ALIGNMENT_SURFACE_STATE,
null_view.cmd_len, null_view.cmd);
}
binding_table[i] = offset - sba_offset;
}
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_BINDING_TABLE,
GEN6_ALIGNMENT_BINDING_TABLE_STATE,
surface_count, binding_table) - sba_offset;
    /* there is a 64KB limit on BINDING_TABLE_STATEs */
assert(offset + sizeof(uint32_t) * surface_count <= 64 * 1024);
return offset;
}
static void gen6_3DSTATE_VERTEX_BUFFERS(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const uint8_t cmd_len = 1 + 4 * pipeline->vb_count;
uint32_t *dw;
uint32_t pos, i;
CMD_ASSERT(cmd, 6, 7.5);
if (!pipeline->vb_count)
return;
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_BUFFERS) | (cmd_len - 2);
dw++;
pos++;
for (i = 0; i < pipeline->vb_count; i++) {
assert(pipeline->vb[i].stride <= 2048);
dw[0] = i << GEN6_VB_DW0_INDEX__SHIFT |
pipeline->vb[i].stride;
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
dw[0] |= GEN7_MOCS_L3_WB << GEN6_VB_DW0_MOCS__SHIFT |
GEN7_VB_DW0_ADDR_MODIFIED;
}
switch (pipeline->vb[i].inputRate) {
case VK_VERTEX_INPUT_RATE_VERTEX:
dw[0] |= GEN6_VB_DW0_ACCESS_VERTEXDATA;
dw[3] = 0;
break;
case VK_VERTEX_INPUT_RATE_INSTANCE:
dw[0] |= GEN6_VB_DW0_ACCESS_INSTANCEDATA;
dw[3] = 1;
break;
default:
assert(!"unknown step rate");
dw[0] |= GEN6_VB_DW0_ACCESS_VERTEXDATA;
dw[3] = 0;
break;
}
if (cmd->bind.vertex.buf[i]) {
const struct intel_buf *buf = cmd->bind.vertex.buf[i];
const VkDeviceSize offset = cmd->bind.vertex.offset[i];
cmd_reserve_reloc(cmd, 2);
cmd_batch_reloc(cmd, pos + 1, buf->obj.mem->bo, offset, 0);
cmd_batch_reloc(cmd, pos + 2, buf->obj.mem->bo, buf->size - 1, 0);
} else {
dw[0] |= GEN6_VB_DW0_IS_NULL;
dw[1] = 0;
dw[2] = 0;
}
dw += 4;
pos += 4;
}
}
static void gen6_3DSTATE_VS(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
const struct intel_pipeline_shader *vs = &pipeline->vs;
const uint8_t cmd_len = 6;
const uint32_t dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
uint32_t dw2, dw4, dw5, *dw;
uint32_t pos;
int vue_read_len;
CMD_ASSERT(cmd, 6, 7.5);
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 135:
*
* "(Vertex URB Entry Read Length) Specifies the number of pairs of
* 128-bit vertex elements to be passed into the payload for each
* vertex."
*
* "It is UNDEFINED to set this field to 0 indicating no Vertex URB
* data to be read and passed to the thread."
*/
vue_read_len = (vs->in_count + 1) / 2;
if (!vue_read_len)
vue_read_len = 1;
dw2 = (vs->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
vs->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw4 = vs->urb_grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT |
vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
0 << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT;
dw5 = GEN6_VS_DW5_STATISTICS |
GEN6_VS_DW5_VS_ENABLE;
if (cmd_gen(cmd) >= INTEL_GEN(7.5))
dw5 |= (vs->max_threads - 1) << GEN75_VS_DW5_MAX_THREADS__SHIFT;
else
dw5 |= (vs->max_threads - 1) << GEN6_VS_DW5_MAX_THREADS__SHIFT;
if (pipeline->disable_vs_cache)
dw5 |= GEN6_VS_DW5_CACHE_DISABLE;
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = dw0;
dw[1] = cmd->bind.pipeline.vs_offset;
dw[2] = dw2;
dw[3] = 0; /* scratch */
dw[4] = dw4;
dw[5] = dw5;
if (vs->per_thread_scratch_size)
gen6_add_scratch_space(cmd, pos + 3, pipeline, vs);
}
static void emit_shader_resources(struct intel_cmd *cmd)
{
/* five HW shader stages */
uint32_t binding_tables[5], samplers[5];
binding_tables[0] = emit_binding_table(cmd,
cmd->bind.pipeline.graphics->vs.rmap,
VK_SHADER_STAGE_VERTEX_BIT);
binding_tables[1] = emit_binding_table(cmd,
cmd->bind.pipeline.graphics->tcs.rmap,
VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT);
binding_tables[2] = emit_binding_table(cmd,
cmd->bind.pipeline.graphics->tes.rmap,
VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT);
binding_tables[3] = emit_binding_table(cmd,
cmd->bind.pipeline.graphics->gs.rmap,
VK_SHADER_STAGE_GEOMETRY_BIT);
binding_tables[4] = emit_binding_table(cmd,
cmd->bind.pipeline.graphics->fs.rmap,
VK_SHADER_STAGE_FRAGMENT_BIT);
samplers[0] = emit_samplers(cmd, cmd->bind.pipeline.graphics->vs.rmap);
samplers[1] = emit_samplers(cmd, cmd->bind.pipeline.graphics->tcs.rmap);
samplers[2] = emit_samplers(cmd, cmd->bind.pipeline.graphics->tes.rmap);
samplers[3] = emit_samplers(cmd, cmd->bind.pipeline.graphics->gs.rmap);
samplers[4] = emit_samplers(cmd, cmd->bind.pipeline.graphics->fs.rmap);
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_VS,
binding_tables[0]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_HS,
binding_tables[1]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_DS,
binding_tables[2]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_GS,
binding_tables[3]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_PS,
binding_tables[4]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_SAMPLER_STATE_POINTERS_VS,
samplers[0]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_SAMPLER_STATE_POINTERS_HS,
samplers[1]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_SAMPLER_STATE_POINTERS_DS,
samplers[2]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_SAMPLER_STATE_POINTERS_GS,
samplers[3]);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_SAMPLER_STATE_POINTERS_PS,
samplers[4]);
} else {
assert(!binding_tables[1] && !binding_tables[2]);
gen6_3DSTATE_BINDING_TABLE_POINTERS(cmd,
binding_tables[0], binding_tables[3], binding_tables[4]);
assert(!samplers[1] && !samplers[2]);
gen6_3DSTATE_SAMPLER_STATE_POINTERS(cmd,
samplers[0], samplers[3], samplers[4]);
}
}
static void emit_msaa(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
if (!cmd->bind.render_pass_changed)
return;
cmd_wa_gen6_pre_multisample_depth_flush(cmd);
gen6_3DSTATE_MULTISAMPLE(cmd, pipeline->sample_count);
}
static void emit_rt(struct intel_cmd *cmd)
{
const struct intel_fb *fb = cmd->bind.fb;
if (!cmd->bind.render_pass_changed)
return;
cmd_wa_gen6_pre_depth_stall_write(cmd);
gen6_3DSTATE_DRAWING_RECTANGLE(cmd, fb->width,
fb->height);
}
static void emit_ds(struct intel_cmd *cmd)
{
const struct intel_render_pass *rp = cmd->bind.render_pass;
const struct intel_render_pass_subpass *subpass =
cmd->bind.render_pass_subpass;
const struct intel_fb *fb = cmd->bind.fb;
const struct intel_att_view *view =
(subpass->ds_index < rp->attachment_count) ?
fb->views[subpass->ds_index] : NULL;
if (!cmd->bind.render_pass_changed)
return;
if (!view) {
/* all zeros */
static const struct intel_att_view null_view;
view = &null_view;
}
cmd_wa_gen6_pre_ds_flush(cmd);
gen6_3DSTATE_DEPTH_BUFFER(cmd, view, subpass->ds_optimal);
gen6_3DSTATE_STENCIL_BUFFER(cmd, view, subpass->ds_optimal);
gen6_3DSTATE_HIER_DEPTH_BUFFER(cmd, view, subpass->ds_optimal);
if (cmd_gen(cmd) >= INTEL_GEN(7))
gen7_3DSTATE_CLEAR_PARAMS(cmd, 0);
else
gen6_3DSTATE_CLEAR_PARAMS(cmd, 0);
}
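/*
 * Upload a shader kernel into the instruction buffer, with a small
 * per-command-buffer cache so that re-binding a pipeline does not
 * upload the same kernel twice.
 */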
static uint32_t emit_shader(struct intel_cmd *cmd,
const struct intel_pipeline_shader *shader)
{
struct intel_cmd_shader_cache *cache = &cmd->bind.shader_cache;
uint32_t offset;
uint32_t i;
/* see if the shader is already in the cache */
for (i = 0; i < cache->used; i++) {
if (cache->entries[i].shader == (const void *) shader)
return cache->entries[i].kernel_offset;
}
offset = cmd_instruction_write(cmd, shader->codeSize, shader->pCode);
/* grow the cache if full */
if (cache->used >= cache->count) {
const uint32_t count = cache->count + 16;
void *entries;
entries = intel_alloc(cmd, sizeof(cache->entries[0]) * count, 0,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (entries) {
if (cache->entries) {
memcpy(entries, cache->entries,
sizeof(cache->entries[0]) * cache->used);
intel_free(cmd, cache->entries);
}
cache->entries = entries;
cache->count = count;
}
}
/* add the shader to the cache */
if (cache->used < cache->count) {
cache->entries[cache->used].shader = (const void *) shader;
cache->entries[cache->used].kernel_offset = offset;
cache->used++;
}
return offset;
}
static void emit_graphics_pipeline(struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
if (pipeline->wa_flags & INTEL_CMD_WA_GEN6_PRE_DEPTH_STALL_WRITE)
cmd_wa_gen6_pre_depth_stall_write(cmd);
if (pipeline->wa_flags & INTEL_CMD_WA_GEN6_PRE_COMMAND_SCOREBOARD_STALL)
cmd_wa_gen6_pre_command_scoreboard_stall(cmd);
if (pipeline->wa_flags & INTEL_CMD_WA_GEN7_PRE_VS_DEPTH_STALL_WRITE)
cmd_wa_gen7_pre_vs_depth_stall_write(cmd);
    /* 3DSTATE_URB_VS, etc. */
assert(pipeline->cmd_len);
cmd_batch_write(cmd, pipeline->cmd_len, pipeline->cmds);
if (pipeline->active_shaders & SHADER_VERTEX_FLAG) {
cmd->bind.pipeline.vs_offset = emit_shader(cmd, &pipeline->vs);
}
if (pipeline->active_shaders & SHADER_TESS_CONTROL_FLAG) {
cmd->bind.pipeline.tcs_offset = emit_shader(cmd, &pipeline->tcs);
}
if (pipeline->active_shaders & SHADER_TESS_EVAL_FLAG) {
cmd->bind.pipeline.tes_offset = emit_shader(cmd, &pipeline->tes);
}
if (pipeline->active_shaders & SHADER_GEOMETRY_FLAG) {
cmd->bind.pipeline.gs_offset = emit_shader(cmd, &pipeline->gs);
}
if (pipeline->active_shaders & SHADER_FRAGMENT_FLAG) {
cmd->bind.pipeline.fs_offset = emit_shader(cmd, &pipeline->fs);
}
if (pipeline->wa_flags & INTEL_CMD_WA_GEN7_POST_COMMAND_CS_STALL)
cmd_wa_gen7_post_command_cs_stall(cmd);
if (pipeline->wa_flags & INTEL_CMD_WA_GEN7_POST_COMMAND_DEPTH_STALL)
cmd_wa_gen7_post_command_depth_stall(cmd);
}
static void
viewport_get_guardband(const struct intel_gpu *gpu,
int center_x, int center_y,
int *min_gbx, int *max_gbx,
int *min_gby, int *max_gby)
{
/*
* From the Sandy Bridge PRM, volume 2 part 1, page 234:
*
* "Per-Device Guardband Extents
*
* - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1]
* - Maximum Post-Clamp Delta (X or Y): 16K"
*
* "In addition, in order to be correctly rendered, objects must have a
* screenspace bounding box not exceeding 8K in the X or Y direction.
* This additional restriction must also be comprehended by software,
* i.e., enforced by use of clipping."
*
* From the Ivy Bridge PRM, volume 2 part 1, page 248:
*
* "Per-Device Guardband Extents
*
* - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1]
* - Maximum Post-Clamp Delta (X or Y): N/A"
*
* "In addition, in order to be correctly rendered, objects must have a
* screenspace bounding box not exceeding 8K in the X or Y direction.
* This additional restriction must also be comprehended by software,
* i.e., enforced by use of clipping."
*
* Combined, the bounding box of any object cannot exceed 8K in either
* width or height.
*
* Below we set the guardband to a square with sides of length 8K,
* centered at the viewport. This ensures that all objects passing the
* GB test are valid to the renderer, and that those failing the XY
* clipping have a better chance of passing the GB test.
*/
const int max_extent = (intel_gpu_gen(gpu) >= INTEL_GEN(7)) ? 32768 : 16384;
const int half_len = 8192 / 2;
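/*
 * Illustrative numbers (not exercised at runtime): on gen7+, max_extent
 * is 32768 and half_len is 4096, so a viewport centered at (0, 0) gets
 * a guardband of [-4096, 4096] in both X and Y, while a center_x of
 * 30000 is first clamped to 32768 - 4096 = 28672.
 */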
/* make sure the guardband is within the valid range */
if (center_x - half_len < -max_extent)
center_x = -max_extent + half_len;
else if (center_x + half_len > max_extent - 1)
center_x = max_extent - half_len;
if (center_y - half_len < -max_extent)
center_y = -max_extent + half_len;
else if (center_y + half_len > max_extent - 1)
center_y = max_extent - half_len;
*min_gbx = center_x - half_len;
*max_gbx = center_x + half_len;
*min_gby = center_y - half_len;
*max_gby = center_y + half_len;
}
static void
viewport_state_cmd(struct intel_dynamic_viewport *state,
const struct intel_gpu *gpu,
uint32_t count)
{
INTEL_GPU_ASSERT(gpu, 6, 7.5);
state->viewport_count = count;
assert(count <= INTEL_MAX_VIEWPORTS);
if (intel_gpu_gen(gpu) >= INTEL_GEN(7)) {
state->cmd_len = 16 * count;
state->cmd_clip_pos = 8;
} else {
state->cmd_len = 8 * count;
state->cmd_clip_pos = state->cmd_len;
state->cmd_len += 4 * count;
}
state->cmd_cc_pos = state->cmd_len;
state->cmd_len += 2 * count;
state->cmd_scissor_rect_pos = state->cmd_len;
state->cmd_len += 2 * count;
assert(sizeof(uint32_t) * state->cmd_len <= sizeof(state->cmd));
}
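/*
 * The resulting dword layout of state->cmd, following the math above:
 *
 *   gen7+: SF_CLIP_VIEWPORT (16 dw each, CLIP part at dw 8) x count,
 *          then CC_VIEWPORT (2 dw) x count, SCISSOR_RECT (2 dw) x count
 *   gen6:  SF_VIEWPORT (8 dw) x count, CLIP_VIEWPORT (4 dw) x count,
 *          then CC_VIEWPORT (2 dw) x count, SCISSOR_RECT (2 dw) x count
 */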
static void
set_viewport_state(
struct intel_cmd* cmd)
{
const struct intel_gpu *gpu = cmd->dev->gpu;
struct intel_dynamic_viewport *state = &cmd->bind.state.viewport;
const uint32_t sf_stride = (intel_gpu_gen(gpu) >= INTEL_GEN(7)) ? 16 : 8;
const uint32_t clip_stride = (intel_gpu_gen(gpu) >= INTEL_GEN(7)) ? 16 : 4;
uint32_t *sf_viewport, *clip_viewport, *cc_viewport, *scissor_rect;
uint32_t i;
INTEL_GPU_ASSERT(gpu, 6, 7.5);
viewport_state_cmd(state, gpu, cmd->bind.state.viewport.viewport_count);
sf_viewport = state->cmd;
clip_viewport = state->cmd + state->cmd_clip_pos;
cc_viewport = state->cmd + state->cmd_cc_pos;
scissor_rect = state->cmd + state->cmd_scissor_rect_pos;
for (i = 0; i < cmd->bind.state.viewport.viewport_count; i++) {
const VkViewport *viewport = &cmd->bind.state.viewport.viewports[i];
uint32_t *dw = NULL;
float translate[3], scale[3];
int min_gbx, max_gbx, min_gby, max_gby;
scale[0] = viewport->width / 2.0f;
scale[1] = viewport->height / 2.0f;
scale[2] = viewport->maxDepth - viewport->minDepth;
translate[0] = viewport->x + scale[0];
translate[1] = viewport->y + scale[1];
translate[2] = viewport->minDepth;
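/*
 * This is the usual viewport transform, screen = translate + ndc * scale:
 * ndc_x in [-1, 1] maps to [x, x + width], and ndc_z in [0, 1] maps to
 * [minDepth, maxDepth].
 */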
viewport_get_guardband(gpu, (int) translate[0], (int) translate[1],
&min_gbx, &max_gbx, &min_gby, &max_gby);
/* SF_VIEWPORT */
dw = sf_viewport;
dw[0] = u_fui(scale[0]);
dw[1] = u_fui(scale[1]);
dw[2] = u_fui(scale[2]);
dw[3] = u_fui(translate[0]);
dw[4] = u_fui(translate[1]);
dw[5] = u_fui(translate[2]);
dw[6] = 0;
dw[7] = 0;
sf_viewport += sf_stride;
/* CLIP_VIEWPORT */
dw = clip_viewport;
dw[0] = u_fui(((float) min_gbx - translate[0]) / fabsf(scale[0]));
dw[1] = u_fui(((float) max_gbx - translate[0]) / fabsf(scale[0]));
dw[2] = u_fui(((float) min_gby - translate[1]) / fabsf(scale[1]));
dw[3] = u_fui(((float) max_gby - translate[1]) / fabsf(scale[1]));
clip_viewport += clip_stride;
/* CC_VIEWPORT */
dw = cc_viewport;
dw[0] = u_fui(viewport->minDepth);
dw[1] = u_fui(viewport->maxDepth);
cc_viewport += 2;
}
for (i = 0; i < cmd->bind.state.viewport.viewport_count; i++) {
const VkRect2D *scissor = &cmd->bind.state.viewport.scissors[i];
/* SCISSOR_RECT */
int16_t max_x, max_y;
uint32_t *dw = NULL;
max_x = (scissor->offset.x + scissor->extent.width - 1) & 0xffff;
max_y = (scissor->offset.y + scissor->extent.height - 1) & 0xffff;
dw = scissor_rect;
if (scissor->extent.width && scissor->extent.height) {
dw[0] = (scissor->offset.y & 0xffff) << 16 |
(scissor->offset.x & 0xffff);
dw[1] = max_y << 16 | max_x;
} else {
dw[0] = 1 << 16 | 1;
dw[1] = 0;
}
scissor_rect += 2;
}
}
static void emit_bounded_states(struct intel_cmd *cmd)
{
set_viewport_state(cmd);
emit_msaa(cmd);
emit_graphics_pipeline(cmd);
emit_rt(cmd);
emit_ds(cmd);
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
gen7_cc_states(cmd);
gen7_viewport_states(cmd);
gen7_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_VS,
&cmd->bind.pipeline.graphics->vs);
gen7_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_GS,
&cmd->bind.pipeline.graphics->gs);
gen7_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_PS,
&cmd->bind.pipeline.graphics->fs);
gen7_3DSTATE_GS(cmd);
gen6_3DSTATE_CLIP(cmd);
gen7_3DSTATE_SF(cmd);
gen7_3DSTATE_WM(cmd);
gen7_3DSTATE_PS(cmd);
} else {
gen6_cc_states(cmd);
gen6_viewport_states(cmd);
gen6_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_VS,
&cmd->bind.pipeline.graphics->vs);
gen6_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_GS,
&cmd->bind.pipeline.graphics->gs);
gen6_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_PS,
&cmd->bind.pipeline.graphics->fs);
gen6_3DSTATE_GS(cmd);
gen6_3DSTATE_CLIP(cmd);
gen6_3DSTATE_SF(cmd);
gen6_3DSTATE_WM(cmd);
}
emit_shader_resources(cmd);
cmd_wa_gen6_pre_depth_stall_write(cmd);
gen6_3DSTATE_VERTEX_BUFFERS(cmd);
gen6_3DSTATE_VS(cmd);
}
static uint32_t gen6_meta_DEPTH_STENCIL_STATE(struct intel_cmd *cmd,
const struct intel_cmd_meta *meta)
{
const uint8_t cmd_align = GEN6_ALIGNMENT_DEPTH_STENCIL_STATE;
const uint8_t cmd_len = 3;
uint32_t dw[3];
CMD_ASSERT(cmd, 6, 7.5);
/* TODO: aspect is now a mask, can you do both? */
if (meta->ds.aspect == VK_IMAGE_ASPECT_DEPTH_BIT) {
dw[0] = 0;
dw[1] = 0;
if (meta->ds.op == INTEL_CMD_META_DS_RESOLVE) {
dw[2] = GEN6_ZS_DW2_DEPTH_TEST_ENABLE |
GEN6_COMPAREFUNCTION_NEVER << 27 |
GEN6_ZS_DW2_DEPTH_WRITE_ENABLE;
} else {
dw[2] = GEN6_COMPAREFUNCTION_ALWAYS << 27 |
GEN6_ZS_DW2_DEPTH_WRITE_ENABLE;
}
} else if (meta->ds.aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
dw[0] = GEN6_ZS_DW0_STENCIL_TEST_ENABLE |
(GEN6_COMPAREFUNCTION_ALWAYS) << 28 |
(GEN6_STENCILOP_KEEP) << 25 |
(GEN6_STENCILOP_KEEP) << 22 |
(GEN6_STENCILOP_REPLACE) << 19 |
GEN6_ZS_DW0_STENCIL_WRITE_ENABLE |
GEN6_ZS_DW0_STENCIL1_ENABLE |
(GEN6_COMPAREFUNCTION_ALWAYS) << 12 |
(GEN6_STENCILOP_KEEP) << 9 |
(GEN6_STENCILOP_KEEP) << 6 |
(GEN6_STENCILOP_REPLACE) << 3;
dw[1] = 0xff << GEN6_ZS_DW1_STENCIL0_VALUEMASK__SHIFT |
0xff << GEN6_ZS_DW1_STENCIL0_WRITEMASK__SHIFT |
0xff << GEN6_ZS_DW1_STENCIL1_VALUEMASK__SHIFT |
0xff << GEN6_ZS_DW1_STENCIL1_WRITEMASK__SHIFT;
dw[2] = 0;
}
return cmd_state_write(cmd, INTEL_CMD_ITEM_DEPTH_STENCIL,
cmd_align, cmd_len, dw);
}
static void gen6_meta_dynamic_states(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
uint32_t blend_offset, ds_offset, cc_offset, cc_vp_offset, *dw;
CMD_ASSERT(cmd, 6, 7.5);
blend_offset = 0;
ds_offset = 0;
cc_offset = 0;
cc_vp_offset = 0;
if (meta->mode == INTEL_CMD_META_FS_RECT) {
/* BLEND_STATE */
blend_offset = cmd_state_pointer(cmd, INTEL_CMD_ITEM_BLEND,
GEN6_ALIGNMENT_BLEND_STATE, 2, &dw);
dw[0] = 0;
dw[1] = GEN6_RT_DW1_COLORCLAMP_RTFORMAT | 0x3;
}
if (meta->mode != INTEL_CMD_META_VS_POINTS) {
if (meta->ds.aspect == VK_IMAGE_ASPECT_DEPTH_BIT ||
meta->ds.aspect == VK_IMAGE_ASPECT_STENCIL_BIT) {
const uint32_t blend_color[4] = { 0, 0, 0, 0 };
uint32_t stencil_ref = (meta->ds.stencil_ref & 0xff) << 24 |
(meta->ds.stencil_ref & 0xff) << 16;
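/* dw0 of COLOR_CALC_STATE holds both front (31:24) and back (23:16) refs */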
/* DEPTH_STENCIL_STATE */
ds_offset = gen6_meta_DEPTH_STENCIL_STATE(cmd, meta);
/* COLOR_CALC_STATE */
cc_offset = gen6_COLOR_CALC_STATE(cmd,
stencil_ref, blend_color);
/* CC_VIEWPORT */
cc_vp_offset = cmd_state_pointer(cmd, INTEL_CMD_ITEM_CC_VIEWPORT,
GEN6_ALIGNMENT_CC_VIEWPORT, 2, &dw);
dw[0] = u_fui(0.0f);
dw[1] = u_fui(1.0f);
} else {
/* DEPTH_STENCIL_STATE */
ds_offset = cmd_state_pointer(cmd, INTEL_CMD_ITEM_DEPTH_STENCIL,
GEN6_ALIGNMENT_DEPTH_STENCIL_STATE,
GEN6_DEPTH_STENCIL_STATE__SIZE, &dw);
memset(dw, 0, sizeof(*dw) * GEN6_DEPTH_STENCIL_STATE__SIZE);
}
}
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_BLEND_STATE_POINTERS,
blend_offset);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_DEPTH_STENCIL_STATE_POINTERS,
ds_offset);
gen7_3dstate_pointer(cmd,
GEN6_RENDER_OPCODE_3DSTATE_CC_STATE_POINTERS, cc_offset);
gen7_3dstate_pointer(cmd,
GEN7_RENDER_OPCODE_3DSTATE_VIEWPORT_STATE_POINTERS_CC,
cc_vp_offset);
} else {
/* 3DSTATE_CC_STATE_POINTERS */
gen6_3DSTATE_CC_STATE_POINTERS(cmd, blend_offset, ds_offset, cc_offset);
/* 3DSTATE_VIEWPORT_STATE_POINTERS */
cmd_batch_pointer(cmd, 4, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VIEWPORT_STATE_POINTERS) | (4 - 2) |
GEN6_VP_PTR_DW0_CC_CHANGED;
dw[1] = 0;
dw[2] = 0;
dw[3] = cc_vp_offset;
}
}
static void gen6_meta_surface_states(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
uint32_t binding_table[2] = { 0, 0 };
uint32_t offset;
const uint32_t sba_offset =
cmd->writers[INTEL_CMD_WRITER_SURFACE].sba_offset;
CMD_ASSERT(cmd, 6, 7.5);
if (meta->mode == INTEL_CMD_META_DEPTH_STENCIL_RECT)
return;
/* SURFACE_STATEs */
if (meta->src.valid) {
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_SURFACE,
GEN6_ALIGNMENT_SURFACE_STATE,
meta->src.surface_len, meta->src.surface);
cmd_reserve_reloc(cmd, 1);
if (meta->src.reloc_flags & INTEL_CMD_RELOC_TARGET_IS_WRITER) {
cmd_surface_reloc_writer(cmd, offset, 1,
meta->src.reloc_target, meta->src.reloc_offset);
} else {
cmd_surface_reloc(cmd, offset, 1,
(struct intel_bo *) meta->src.reloc_target,
meta->src.reloc_offset, meta->src.reloc_flags);
}
binding_table[0] = offset - sba_offset;
}
if (meta->dst.valid) {
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_SURFACE,
GEN6_ALIGNMENT_SURFACE_STATE,
meta->dst.surface_len, meta->dst.surface);
cmd_reserve_reloc(cmd, 1);
cmd_surface_reloc(cmd, offset, 1,
(struct intel_bo *) meta->dst.reloc_target,
meta->dst.reloc_offset, meta->dst.reloc_flags);
binding_table[1] = offset - sba_offset;
}
/* BINDING_TABLE */
offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_BINDING_TABLE,
GEN6_ALIGNMENT_BINDING_TABLE_STATE,
2, binding_table);
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
const int subop = (meta->mode == INTEL_CMD_META_VS_POINTS) ?
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_VS :
GEN7_RENDER_OPCODE_3DSTATE_BINDING_TABLE_POINTERS_PS;
gen7_3dstate_pointer(cmd, subop, offset - sba_offset);
} else {
/* 3DSTATE_BINDING_TABLE_POINTERS */
if (meta->mode == INTEL_CMD_META_VS_POINTS)
gen6_3DSTATE_BINDING_TABLE_POINTERS(cmd, offset - sba_offset, 0, 0);
else
gen6_3DSTATE_BINDING_TABLE_POINTERS(cmd, 0, 0, offset - sba_offset);
}
}
static void gen6_meta_urb(struct intel_cmd *cmd)
{
const int vs_entry_count = (cmd->dev->gpu->gt == 2) ? 256 : 128;
uint32_t *dw;
CMD_ASSERT(cmd, 6, 6);
/* 3DSTATE_URB */
cmd_batch_pointer(cmd, 3, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_URB) | (3 - 2);
dw[1] = vs_entry_count << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT;
dw[2] = 0;
}
static void gen7_meta_urb(struct intel_cmd *cmd)
{
const int pcb_alloc = (cmd->dev->gpu->gt == 3) ? 16 : 8;
const int urb_offset = pcb_alloc / 8;
int vs_entry_count;
uint32_t *dw;
CMD_ASSERT(cmd, 7, 7.5);
cmd_wa_gen7_pre_vs_depth_stall_write(cmd);
switch (cmd_gen(cmd)) {
case INTEL_GEN(7.5):
vs_entry_count = (cmd->dev->gpu->gt >= 2) ? 1664 : 640;
break;
case INTEL_GEN(7):
default:
vs_entry_count = (cmd->dev->gpu->gt == 2) ? 704 : 512;
break;
}
/* 3DSTATE_URB_x */
cmd_batch_pointer(cmd, 8, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_VS) | (2 - 2);
dw[1] = urb_offset << GEN7_URB_DW1_OFFSET__SHIFT |
vs_entry_count;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_HS) | (2 - 2);
dw[1] = urb_offset << GEN7_URB_DW1_OFFSET__SHIFT;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_DS) | (2 - 2);
dw[1] = urb_offset << GEN7_URB_DW1_OFFSET__SHIFT;
dw += 2;
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_GS) | (2 - 2);
dw[1] = urb_offset << GEN7_URB_DW1_OFFSET__SHIFT;
}
static void gen6_meta_vf(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
uint32_t vb_start, vb_end, vb_stride;
int ve_format, ve_z_source;
uint32_t *dw;
uint32_t pos;
CMD_ASSERT(cmd, 6, 7.5);
switch (meta->mode) {
case INTEL_CMD_META_VS_POINTS:
cmd_batch_pointer(cmd, 3, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_ELEMENTS) | (3 - 2);
dw[1] = GEN6_VE_DW0_VALID;
dw[2] = GEN6_VFCOMP_STORE_VID << GEN6_VE_DW1_COMP0__SHIFT |
GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP1__SHIFT |
GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP2__SHIFT |
GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP3__SHIFT;
return;
case INTEL_CMD_META_FS_RECT:
{
uint32_t vertices[3][2];
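/* a RECTLIST takes three corners; the hardware derives the fourth */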
vertices[0][0] = meta->dst.x + meta->width;
vertices[0][1] = meta->dst.y + meta->height;
vertices[1][0] = meta->dst.x;
vertices[1][1] = meta->dst.y + meta->height;
vertices[2][0] = meta->dst.x;
vertices[2][1] = meta->dst.y;
vb_start = cmd_state_write(cmd, INTEL_CMD_ITEM_BLOB, 32,
sizeof(vertices) / 4, (const uint32_t *) vertices);
vb_end = vb_start + sizeof(vertices) - 1;
vb_stride = sizeof(vertices[0]);
ve_z_source = GEN6_VFCOMP_STORE_0;
ve_format = GEN6_FORMAT_R32G32_USCALED;
}
break;
case INTEL_CMD_META_DEPTH_STENCIL_RECT:
{
float vertices[3][3];
vertices[0][0] = (float) (meta->dst.x + meta->width);
vertices[0][1] = (float) (meta->dst.y + meta->height);
vertices[0][2] = u_uif(meta->clear_val[0]);
vertices[1][0] = (float) meta->dst.x;
vertices[1][1] = (float) (meta->dst.y + meta->height);
vertices[1][2] = u_uif(meta->clear_val[0]);
vertices[2][0] = (float) meta->dst.x;
vertices[2][1] = (float) meta->dst.y;
vertices[2][2] = u_uif(meta->clear_val[0]);
vb_start = cmd_state_write(cmd, INTEL_CMD_ITEM_BLOB, 32,
sizeof(vertices) / 4, (const uint32_t *) vertices);
vb_end = vb_start + sizeof(vertices) - 1;
vb_stride = sizeof(vertices[0]);
ve_z_source = GEN6_VFCOMP_STORE_SRC;
ve_format = GEN6_FORMAT_R32G32B32_FLOAT;
}
break;
default:
assert(!"unknown meta mode");
return;
}
/* 3DSTATE_VERTEX_BUFFERS */
pos = cmd_batch_pointer(cmd, 5, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_BUFFERS) | (5 - 2);
dw[1] = vb_stride;
if (cmd_gen(cmd) >= INTEL_GEN(7))
dw[1] |= GEN7_VB_DW0_ADDR_MODIFIED;
cmd_reserve_reloc(cmd, 2);
cmd_batch_reloc_writer(cmd, pos + 2, INTEL_CMD_WRITER_STATE, vb_start);
cmd_batch_reloc_writer(cmd, pos + 3, INTEL_CMD_WRITER_STATE, vb_end);
dw[4] = 0;
/* 3DSTATE_VERTEX_ELEMENTS */
cmd_batch_pointer(cmd, 5, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_ELEMENTS) | (5 - 2);
dw[1] = GEN6_VE_DW0_VALID;
dw[2] = GEN6_VFCOMP_STORE_0 << GEN6_VE_DW1_COMP0__SHIFT | /* Reserved */
GEN6_VFCOMP_STORE_0 << GEN6_VE_DW1_COMP1__SHIFT | /* Render Target Array Index */
GEN6_VFCOMP_STORE_0 << GEN6_VE_DW1_COMP2__SHIFT | /* Viewport Index */
GEN6_VFCOMP_STORE_0 << GEN6_VE_DW1_COMP3__SHIFT; /* Point Width */
dw[3] = GEN6_VE_DW0_VALID |
ve_format << GEN6_VE_DW0_FORMAT__SHIFT;
dw[4] = GEN6_VFCOMP_STORE_SRC << GEN6_VE_DW1_COMP0__SHIFT |
GEN6_VFCOMP_STORE_SRC << GEN6_VE_DW1_COMP1__SHIFT |
ve_z_source << GEN6_VE_DW1_COMP2__SHIFT |
GEN6_VFCOMP_STORE_1_FP << GEN6_VE_DW1_COMP3__SHIFT;
}
static uint32_t gen6_meta_vs_constants(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
/* one GPR */
uint32_t consts[8];
uint32_t const_count;
CMD_ASSERT(cmd, 6, 7.5);
switch (meta->shader_id) {
case INTEL_DEV_META_VS_FILL_MEM:
consts[0] = meta->dst.x;
consts[1] = meta->clear_val[0];
const_count = 2;
break;
case INTEL_DEV_META_VS_COPY_MEM:
case INTEL_DEV_META_VS_COPY_MEM_UNALIGNED:
consts[0] = meta->dst.x;
consts[1] = meta->src.x;
const_count = 2;
break;
case INTEL_DEV_META_VS_COPY_R8_TO_MEM:
case INTEL_DEV_META_VS_COPY_R16_TO_MEM:
case INTEL_DEV_META_VS_COPY_R32_TO_MEM:
case INTEL_DEV_META_VS_COPY_R32G32_TO_MEM:
case INTEL_DEV_META_VS_COPY_R32G32B32A32_TO_MEM:
consts[0] = meta->src.x;
consts[1] = meta->src.y;
consts[2] = meta->width;
consts[3] = meta->dst.x;
const_count = 4;
break;
default:
assert(!"unknown meta shader id");
const_count = 0;
break;
}
/* this can be skipped but it makes state dumping prettier */
memset(&consts[const_count], 0, sizeof(consts[0]) * (8 - const_count));
return cmd_state_write(cmd, INTEL_CMD_ITEM_BLOB, 32, 8, consts);
}
static void gen6_meta_vs(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
const struct intel_pipeline_shader *sh =
intel_dev_get_meta_shader(cmd->dev, meta->shader_id);
uint32_t offset, *dw;
CMD_ASSERT(cmd, 6, 7.5);
if (meta->mode != INTEL_CMD_META_VS_POINTS) {
uint32_t cmd_len;
/* 3DSTATE_CONSTANT_VS */
cmd_len = (cmd_gen(cmd) >= INTEL_GEN(7)) ? 7 : 5;
cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_VS) | (cmd_len - 2);
memset(&dw[1], 0, sizeof(*dw) * (cmd_len - 1));
/* 3DSTATE_VS */
cmd_batch_pointer(cmd, 6, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (6 - 2);
memset(&dw[1], 0, sizeof(*dw) * (6 - 1));
return;
}
assert(meta->dst.valid && sh->uses == INTEL_SHADER_USE_VID);
/* 3DSTATE_CONSTANT_VS */
offset = gen6_meta_vs_constants(cmd);
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_VS) | (7 - 2);
dw[1] = 1 << GEN7_CONSTANT_DW1_BUFFER0_READ_LEN__SHIFT;
dw[2] = 0;
dw[3] = offset | GEN7_MOCS_L3_WB;
dw[4] = 0;
dw[5] = 0;
dw[6] = 0;
} else {
cmd_batch_pointer(cmd, 5, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_VS) | (5 - 2) |
1 << GEN6_CONSTANT_DW0_BUFFER_ENABLES__SHIFT;
dw[1] = offset;
dw[2] = 0;
dw[3] = 0;
dw[4] = 0;
}
/* 3DSTATE_VS */
offset = emit_shader(cmd, sh);
cmd_batch_pointer(cmd, 6, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (6 - 2);
dw[1] = offset;
dw[2] = GEN6_THREADDISP_SPF |
(sh->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
sh->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw[3] = 0; /* scratch */
dw[4] = sh->urb_grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT |
1 << GEN6_VS_DW4_URB_READ_LEN__SHIFT;
dw[5] = GEN6_VS_DW5_CACHE_DISABLE |
GEN6_VS_DW5_VS_ENABLE;
if (cmd_gen(cmd) >= INTEL_GEN(7.5))
dw[5] |= (sh->max_threads - 1) << GEN75_VS_DW5_MAX_THREADS__SHIFT;
else
dw[5] |= (sh->max_threads - 1) << GEN6_VS_DW5_MAX_THREADS__SHIFT;
assert(!sh->per_thread_scratch_size);
}
static void gen6_meta_disabled(struct intel_cmd *cmd)
{
uint32_t *dw;
CMD_ASSERT(cmd, 6, 6);
/* 3DSTATE_CONSTANT_GS */
cmd_batch_pointer(cmd, 5, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_GS) | (5 - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
dw[4] = 0;
/* 3DSTATE_GS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (7 - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
dw[4] = 1 << GEN6_GS_DW4_URB_READ_LEN__SHIFT;
dw[5] = GEN6_GS_DW5_STATISTICS;
dw[6] = 0;
/* 3DSTATE_SF */
cmd_batch_pointer(cmd, 20, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (20 - 2);
dw[1] = 1 << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
memset(&dw[2], 0, 18 * sizeof(*dw));
}
static void gen7_meta_disabled(struct intel_cmd *cmd)
{
uint32_t *dw;
CMD_ASSERT(cmd, 7, 7.5);
/* 3DSTATE_CONSTANT_HS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_CONSTANT_HS) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_HS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_TE */
cmd_batch_pointer(cmd, 4, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_TE) | (4 - 2);
memset(&dw[1], 0, sizeof(*dw) * (4 - 1));
/* 3DSTATE_CONSTANT_DS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_CONSTANT_DS) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_DS */
cmd_batch_pointer(cmd, 6, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (6 - 2);
memset(&dw[1], 0, sizeof(*dw) * (6 - 1));
/* 3DSTATE_CONSTANT_GS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_GS) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_GS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_STREAMOUT */
cmd_batch_pointer(cmd, 3, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_STREAMOUT) | (3 - 2);
memset(&dw[1], 0, sizeof(*dw) * (3 - 1));
/* 3DSTATE_SF */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_SBE */
cmd_batch_pointer(cmd, 14, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (14 - 2);
dw[1] = 1 << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
memset(&dw[2], 0, sizeof(*dw) * (14 - 2));
}
static void gen6_meta_clip(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
uint32_t *dw;
/* 3DSTATE_CLIP */
cmd_batch_pointer(cmd, 4, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (4 - 2);
dw[1] = 0;
if (meta->mode == INTEL_CMD_META_VS_POINTS) {
dw[2] = GEN6_CLIP_DW2_CLIP_ENABLE |
GEN6_CLIP_DW2_CLIPMODE_REJECT_ALL;
} else {
dw[2] = 0;
}
dw[3] = 0;
}
static void gen6_meta_wm(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
uint32_t *dw;
CMD_ASSERT(cmd, 6, 7.5);
cmd_wa_gen6_pre_multisample_depth_flush(cmd);
/* 3DSTATE_MULTISAMPLE */
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
cmd_batch_pointer(cmd, 4, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (4 - 2);
dw[1] =
(meta->sample_count <= 1) ? GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1 :
(meta->sample_count <= 4) ? GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4 :
GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8;
dw[2] = 0;
dw[3] = 0;
} else {
cmd_batch_pointer(cmd, 3, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (3 - 2);
dw[1] = (meta->sample_count <= 1) ? GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1 :
GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4;
dw[2] = 0;
}
/* 3DSTATE_SAMPLE_MASK */
cmd_batch_pointer(cmd, 2, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (2 - 2);
dw[1] = (1 << meta->sample_count) - 1;
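/* e.g. sample_count == 4 gives a sample mask of 0xf */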
/* 3DSTATE_DRAWING_RECTANGLE */
cmd_batch_pointer(cmd, 4, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_DRAWING_RECTANGLE) | (4 - 2);
if (meta->mode == INTEL_CMD_META_VS_POINTS) {
/* unused */
dw[1] = 0;
dw[2] = 0;
} else {
dw[1] = meta->dst.y << 16 | meta->dst.x;
dw[2] = (meta->dst.y + meta->height - 1) << 16 |
(meta->dst.x + meta->width - 1);
}
dw[3] = 0;
}
static uint32_t gen6_meta_ps_constants(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
uint32_t offset_x, offset_y;
/* one GPR */
uint32_t consts[8];
uint32_t const_count;
CMD_ASSERT(cmd, 6, 7.5);
/* underflow is fine here */
offset_x = meta->src.x - meta->dst.x;
offset_y = meta->src.y - meta->dst.y;
switch (meta->shader_id) {
case INTEL_DEV_META_FS_COPY_MEM:
case INTEL_DEV_META_FS_COPY_1D:
case INTEL_DEV_META_FS_COPY_1D_ARRAY:
case INTEL_DEV_META_FS_COPY_2D:
case INTEL_DEV_META_FS_COPY_2D_ARRAY:
case INTEL_DEV_META_FS_COPY_2D_MS:
consts[0] = offset_x;
consts[1] = offset_y;
consts[2] = meta->src.layer;
consts[3] = meta->src.lod;
const_count = 4;
break;
case INTEL_DEV_META_FS_COPY_1D_TO_MEM:
case INTEL_DEV_META_FS_COPY_1D_ARRAY_TO_MEM:
case INTEL_DEV_META_FS_COPY_2D_TO_MEM:
case INTEL_DEV_META_FS_COPY_2D_ARRAY_TO_MEM:
case INTEL_DEV_META_FS_COPY_2D_MS_TO_MEM:
consts[0] = offset_x;
consts[1] = offset_y;
consts[2] = meta->src.layer;
consts[3] = meta->src.lod;
consts[4] = meta->src.x;
consts[5] = meta->width;
const_count = 6;
break;
case INTEL_DEV_META_FS_COPY_MEM_TO_IMG:
consts[0] = offset_x;
consts[1] = offset_y;
consts[2] = meta->width;
const_count = 3;
break;
case INTEL_DEV_META_FS_CLEAR_COLOR:
consts[0] = meta->clear_val[0];
consts[1] = meta->clear_val[1];
consts[2] = meta->clear_val[2];
consts[3] = meta->clear_val[3];
const_count = 4;
break;
case INTEL_DEV_META_FS_CLEAR_DEPTH:
consts[0] = meta->clear_val[0];
consts[1] = meta->clear_val[1];
const_count = 2;
break;
case INTEL_DEV_META_FS_RESOLVE_2X:
case INTEL_DEV_META_FS_RESOLVE_4X:
case INTEL_DEV_META_FS_RESOLVE_8X:
case INTEL_DEV_META_FS_RESOLVE_16X:
consts[0] = offset_x;
consts[1] = offset_y;
const_count = 2;
break;
default:
assert(!"unknown meta shader id");
const_count = 0;
break;
}
/* this can be skipped but it makes state dumping prettier */
memset(&consts[const_count], 0, sizeof(consts[0]) * (8 - const_count));
return cmd_state_write(cmd, INTEL_CMD_ITEM_BLOB, 32, 8, consts);
}
static void gen6_meta_ps(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
const struct intel_pipeline_shader *sh =
intel_dev_get_meta_shader(cmd->dev, meta->shader_id);
uint32_t offset, *dw;
CMD_ASSERT(cmd, 6, 6);
if (meta->mode != INTEL_CMD_META_FS_RECT) {
/* 3DSTATE_CONSTANT_PS */
cmd_batch_pointer(cmd, 5, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_PS) | (5 - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
dw[4] = 0;
/* 3DSTATE_WM */
cmd_batch_pointer(cmd, 9, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (9 - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
switch (meta->ds.op) {
case INTEL_CMD_META_DS_HIZ_CLEAR:
dw[4] = GEN6_WM_DW4_DEPTH_CLEAR;
break;
case INTEL_CMD_META_DS_HIZ_RESOLVE:
dw[4] = GEN6_WM_DW4_HIZ_RESOLVE;
break;
case INTEL_CMD_META_DS_RESOLVE:
dw[4] = GEN6_WM_DW4_DEPTH_RESOLVE;
break;
default:
dw[4] = 0;
break;
}
dw[5] = (sh->max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT;
dw[6] = 0;
dw[7] = 0;
dw[8] = 0;
return;
}
/* a normal color write */
assert(meta->dst.valid && !sh->uses);
/* 3DSTATE_CONSTANT_PS */
offset = gen6_meta_ps_constants(cmd);
cmd_batch_pointer(cmd, 5, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_PS) | (5 - 2) |
1 << GEN6_CONSTANT_DW0_BUFFER_ENABLES__SHIFT;
dw[1] = offset;
dw[2] = 0;
dw[3] = 0;
dw[4] = 0;
/* 3DSTATE_WM */
offset = emit_shader(cmd, sh);
cmd_batch_pointer(cmd, 9, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (9 - 2);
dw[1] = offset;
dw[2] = (sh->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
sh->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw[3] = 0; /* scratch */
dw[4] = sh->urb_grf_start << GEN6_WM_DW4_URB_GRF_START0__SHIFT;
dw[5] = (sh->max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT |
GEN6_WM_DW5_PS_DISPATCH_ENABLE |
GEN6_PS_DISPATCH_16 << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
dw[6] = sh->in_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
GEN6_WM_DW6_PS_POSOFFSET_NONE |
GEN6_WM_DW6_ZW_INTERP_PIXEL |
sh->barycentric_interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT |
GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT;
if (meta->sample_count > 1) {
dw[6] |= GEN6_WM_DW6_MSRASTMODE_ON_PATTERN |
GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
} else {
dw[6] |= GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL |
GEN6_WM_DW6_MSDISPMODE_PERSAMPLE;
}
dw[7] = 0;
dw[8] = 0;
assert(!sh->per_thread_scratch_size);
}
static void gen7_meta_ps(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
const struct intel_pipeline_shader *sh =
intel_dev_get_meta_shader(cmd->dev, meta->shader_id);
uint32_t offset, *dw;
CMD_ASSERT(cmd, 7, 7.5);
if (meta->mode != INTEL_CMD_META_FS_RECT) {
/* 3DSTATE_WM */
cmd_batch_pointer(cmd, 3, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (3 - 2);
switch (meta->ds.op) {
case INTEL_CMD_META_DS_HIZ_CLEAR:
dw[1] = GEN7_WM_DW1_DEPTH_CLEAR;
break;
case INTEL_CMD_META_DS_HIZ_RESOLVE:
dw[1] = GEN7_WM_DW1_HIZ_RESOLVE;
break;
case INTEL_CMD_META_DS_RESOLVE:
dw[1] = GEN7_WM_DW1_DEPTH_RESOLVE;
break;
default:
dw[1] = 0;
break;
}
dw[2] = 0;
/* 3DSTATE_CONSTANT_PS */
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_PS) | (7 - 2);
memset(&dw[1], 0, sizeof(*dw) * (7 - 1));
/* 3DSTATE_PS */
cmd_batch_pointer(cmd, 8, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (8 - 2);
dw[1] = 0;
dw[2] = 0;
dw[3] = 0;
/* required to avoid hangs */
dw[4] = GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT |
(sh->max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
dw[5] = 0;
dw[6] = 0;
dw[7] = 0;
return;
}
/* a normal color write */
assert(meta->dst.valid && !sh->uses);
/* 3DSTATE_WM */
cmd_batch_pointer(cmd, 3, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (3 - 2);
dw[1] = GEN7_WM_DW1_PS_DISPATCH_ENABLE |
GEN7_WM_DW1_ZW_INTERP_PIXEL |
sh->barycentric_interps << GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT |
GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
dw[2] = 0;
/* 3DSTATE_CONSTANT_PS */
offset = gen6_meta_ps_constants(cmd);
cmd_batch_pointer(cmd, 7, &dw);
dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CONSTANT_PS) | (7 - 2);
dw[1] = 1 << GEN7_CONSTANT_DW1_BUFFER0_READ_LEN__SHIFT;
dw[2] = 0;
dw[3] = offset | GEN7_MOCS_L3_WB;
dw[4] = 0;
dw[5] = 0;
dw[6] = 0;
/* 3DSTATE_PS */
offset = emit_shader(cmd, sh);
cmd_batch_pointer(cmd, 8, &dw);
dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (8 - 2);
dw[1] = offset;
dw[2] = (sh->sampler_count + 3) / 4 << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
sh->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
dw[3] = 0; /* scratch */
dw[4] = GEN7_PS_DW4_PUSH_CONSTANT_ENABLE |
GEN7_PS_DW4_POSOFFSET_NONE |
GEN6_PS_DISPATCH_16 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
if (cmd_gen(cmd) >= INTEL_GEN(7.5)) {
dw[4] |= (sh->max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
dw[4] |= ((1 << meta->sample_count) - 1) <<
GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
} else {
dw[4] |= (sh->max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
}
dw[5] = sh->urb_grf_start << GEN7_PS_DW5_URB_GRF_START0__SHIFT;
dw[6] = 0;
dw[7] = 0;
assert(!sh->per_thread_scratch_size);
}
static void gen6_meta_depth_buffer(struct intel_cmd *cmd)
{
const struct intel_cmd_meta *meta = cmd->bind.meta;
const struct intel_att_view *view = &meta->ds.view;
CMD_ASSERT(cmd, 6, 7.5);
cmd_wa_gen6_pre_ds_flush(cmd);
gen6_3DSTATE_DEPTH_BUFFER(cmd, view, meta->ds.optimal);
gen6_3DSTATE_STENCIL_BUFFER(cmd, view, meta->ds.optimal);
gen6_3DSTATE_HIER_DEPTH_BUFFER(cmd, view, meta->ds.optimal);
if (cmd_gen(cmd) >= INTEL_GEN(7))
gen7_3DSTATE_CLEAR_PARAMS(cmd, 0);
else
gen6_3DSTATE_CLEAR_PARAMS(cmd, 0);
}
static bool cmd_alloc_dset_data(struct intel_cmd *cmd,
struct intel_cmd_dset_data *data,
const struct intel_pipeline_layout *pipeline_layout)
{
if (data->set_offset_count < pipeline_layout->layout_count) {
if (data->set_offsets)
intel_free(cmd, data->set_offsets);
data->set_offsets = intel_alloc(cmd,
sizeof(data->set_offsets[0]) * pipeline_layout->layout_count,
sizeof(data->set_offsets[0]), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!data->set_offsets) {
cmd_fail(cmd, VK_ERROR_OUT_OF_HOST_MEMORY);
data->set_offset_count = 0;
return false;
}
data->set_offset_count = pipeline_layout->layout_count;
}
if (data->dynamic_offset_count < pipeline_layout->total_dynamic_desc_count) {
if (data->dynamic_offsets)
intel_free(cmd, data->dynamic_offsets);
data->dynamic_offsets = intel_alloc(cmd,
sizeof(data->dynamic_offsets[0]) * pipeline_layout->total_dynamic_desc_count,
sizeof(data->dynamic_offsets[0]), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!data->dynamic_offsets) {
cmd_fail(cmd, VK_ERROR_OUT_OF_HOST_MEMORY);
data->dynamic_offset_count = 0;
return false;
}
data->dynamic_offset_count = pipeline_layout->total_dynamic_desc_count;
}
return true;
}
static void cmd_bind_dynamic_state(struct intel_cmd *cmd,
const struct intel_pipeline *pipeline)
{
VkFlags use_flags = pipeline->state.use_pipeline_dynamic_state;
if (!use_flags) {
return;
}
cmd->bind.state.use_pipeline_dynamic_state = use_flags;
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_VIEWPORT) {
const struct intel_dynamic_viewport *viewport = &pipeline->state.viewport;
intel_set_viewport(cmd, viewport->first_viewport, viewport->viewport_count, viewport->viewports);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_SCISSOR) {
const struct intel_dynamic_viewport *viewport = &pipeline->state.viewport;
intel_set_scissor(cmd, viewport->first_scissor, viewport->scissor_count, viewport->scissors);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_LINE_WIDTH) {
intel_set_line_width(cmd, pipeline->state.line_width.line_width);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_DEPTH_BIAS) {
const struct intel_dynamic_depth_bias *s = &pipeline->state.depth_bias;
intel_set_depth_bias(cmd, s->depth_bias, s->depth_bias_clamp, s->slope_scaled_depth_bias);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_BLEND_CONSTANTS) {
const struct intel_dynamic_blend *s = &pipeline->state.blend;
intel_set_blend_constants(cmd, s->blend_const);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_DEPTH_BOUNDS) {
const struct intel_dynamic_depth_bounds *s = &pipeline->state.depth_bounds;
intel_set_depth_bounds(cmd, s->min_depth_bounds, s->max_depth_bounds);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_STENCIL_COMPARE_MASK) {
const struct intel_dynamic_stencil *s = &pipeline->state.stencil;
intel_set_stencil_compare_mask(cmd, VK_STENCIL_FACE_FRONT_BIT, s->front.stencil_compare_mask);
intel_set_stencil_compare_mask(cmd, VK_STENCIL_FACE_BACK_BIT, s->back.stencil_compare_mask);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_STENCIL_WRITE_MASK) {
const struct intel_dynamic_stencil *s = &pipeline->state.stencil;
intel_set_stencil_write_mask(cmd, VK_STENCIL_FACE_FRONT_BIT, s->front.stencil_write_mask);
intel_set_stencil_write_mask(cmd, VK_STENCIL_FACE_BACK_BIT, s->back.stencil_write_mask);
}
if (use_flags & INTEL_USE_PIPELINE_DYNAMIC_STENCIL_REFERENCE) {
const struct intel_dynamic_stencil *s = &pipeline->state.stencil;
intel_set_stencil_reference(cmd, VK_STENCIL_FACE_FRONT_BIT, s->front.stencil_reference);
intel_set_stencil_reference(cmd, VK_STENCIL_FACE_BACK_BIT, s->back.stencil_reference);
}
}
static void cmd_bind_graphics_pipeline(struct intel_cmd *cmd,
const struct intel_pipeline *pipeline)
{
cmd->bind.pipeline.graphics = pipeline;
cmd_bind_dynamic_state(cmd, pipeline);
cmd_alloc_dset_data(cmd, &cmd->bind.dset.graphics_data,
pipeline->pipeline_layout);
}
static void cmd_bind_compute_pipeline(struct intel_cmd *cmd,
const struct intel_pipeline *pipeline)
{
cmd->bind.pipeline.compute = pipeline;
cmd_alloc_dset_data(cmd, &cmd->bind.dset.compute_data,
pipeline->pipeline_layout);
}
static void cmd_copy_dset_data(struct intel_cmd *cmd,
struct intel_cmd_dset_data *data,
const struct intel_pipeline_layout *pipeline_layout,
uint32_t index,
const struct intel_desc_set *set,
const uint32_t *dynamic_offsets)
{
const struct intel_desc_layout *layout = pipeline_layout->layouts[index];
assert(index < data->set_offset_count);
data->set_offsets[index] = set->region_begin;
if (layout->dynamic_desc_count) {
assert(pipeline_layout->dynamic_desc_indices[index] +
layout->dynamic_desc_count - 1 < data->dynamic_offset_count);
memcpy(&data->dynamic_offsets[pipeline_layout->dynamic_desc_indices[index]],
dynamic_offsets,
sizeof(dynamic_offsets[0]) * layout->dynamic_desc_count);
}
}
static void cmd_bind_vertex_data(struct intel_cmd *cmd,
const struct intel_buf *buf,
VkDeviceSize offset, uint32_t binding)
{
/* TODOVV: verify */
assert(!(binding >= ARRAY_SIZE(cmd->bind.vertex.buf)) && "binding exceeds buf size");
cmd->bind.vertex.buf[binding] = buf;
cmd->bind.vertex.offset[binding] = offset;
}
static void cmd_bind_index_data(struct intel_cmd *cmd,
const struct intel_buf *buf,
VkDeviceSize offset, VkIndexType type)
{
cmd->bind.index.buf = buf;
cmd->bind.index.offset = offset;
cmd->bind.index.type = type;
}
static uint32_t cmd_get_max_surface_write(const struct intel_cmd *cmd)
{
const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
struct intel_pipeline_rmap *rmaps[5] = {
pipeline->vs.rmap,
pipeline->tcs.rmap,
pipeline->tes.rmap,
pipeline->gs.rmap,
pipeline->fs.rmap,
};
uint32_t max_write;
int i;
STATIC_ASSERT(GEN6_ALIGNMENT_SURFACE_STATE >= GEN6_SURFACE_STATE__SIZE);
STATIC_ASSERT(GEN6_ALIGNMENT_SURFACE_STATE >=
GEN6_ALIGNMENT_BINDING_TABLE_STATE);
/* pad first */
max_write = GEN6_ALIGNMENT_SURFACE_STATE;
for (i = 0; i < ARRAY_SIZE(rmaps); i++) {
const struct intel_pipeline_rmap *rmap = rmaps[i];
const uint32_t surface_count = (rmap) ?
rmap->rt_count + rmap->texture_resource_count +
rmap->resource_count + rmap->uav_count : 0;
if (surface_count) {
/* SURFACE_STATEs */
max_write += GEN6_ALIGNMENT_SURFACE_STATE * surface_count;
/* BINDING_TABLE_STATE */
max_write += u_align(sizeof(uint32_t) * surface_count,
GEN6_ALIGNMENT_SURFACE_STATE);
}
}
return max_write;
}
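/*
 * For example (hypothetical counts): a pipeline whose only shader has
 * rt_count = 1 and texture_resource_count = 2 contributes three aligned
 * SURFACE_STATEs plus one aligned 3-entry binding table to the bound,
 * on top of the initial alignment padding.
 */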
static void cmd_adjust_state_base_address(struct intel_cmd *cmd)
{
struct intel_cmd_writer *writer = &cmd->writers[INTEL_CMD_WRITER_SURFACE];
const uint32_t cur_surface_offset = writer->used - writer->sba_offset;
uint32_t max_surface_write;
/* enough for src and dst SURFACE_STATEs plus BINDING_TABLE_STATE */
if (cmd->bind.meta)
max_surface_write = 64 * sizeof(uint32_t);
else
max_surface_write = cmd_get_max_surface_write(cmd);
/* there is a 64KB limit on BINDING_TABLE_STATEs */
if (cur_surface_offset + max_surface_write > 64 * 1024) {
/* SBA expects page-aligned addresses */
writer->sba_offset = writer->used & ~0xfff;
assert((writer->used & 0xfff) + max_surface_write <= 64 * 1024);
cmd_batch_state_base_address(cmd);
}
}
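/*
 * A sketch with made-up numbers: if writer->used is 0xf234 with
 * sba_offset 0, cur_surface_offset is ~62KB; a projected write of a few
 * KB would cross the 64KB window, so sba_offset is rebased to 0xf000
 * and later surface offsets start from 0x234 in the new window.
 */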
static void cmd_draw(struct intel_cmd *cmd,
uint32_t vertex_start,
uint32_t vertex_count,
uint32_t instance_start,
uint32_t instance_count,
bool indexed,
uint32_t vertex_base)
{
const struct intel_pipeline *p = cmd->bind.pipeline.graphics;
const uint32_t surface_writer_used U_ASSERT_ONLY =
cmd->writers[INTEL_CMD_WRITER_SURFACE].used;
cmd_adjust_state_base_address(cmd);
emit_bounded_states(cmd);
/* sanity check on cmd_get_max_surface_write() */
assert(cmd->writers[INTEL_CMD_WRITER_SURFACE].used -
surface_writer_used <= cmd_get_max_surface_write(cmd));
if (indexed) {
assert(!(p->primitive_restart && !gen6_can_primitive_restart(cmd)) && "Primitive restart unsupported on this device");
if (cmd_gen(cmd) >= INTEL_GEN(7.5)) {
gen75_3DSTATE_VF(cmd, p->primitive_restart,
p->primitive_restart_index);
gen6_3DSTATE_INDEX_BUFFER(cmd, cmd->bind.index.buf,
cmd->bind.index.offset, cmd->bind.index.type,
false);
} else {
gen6_3DSTATE_INDEX_BUFFER(cmd, cmd->bind.index.buf,
cmd->bind.index.offset, cmd->bind.index.type,
p->primitive_restart);
}
} else {
assert(!vertex_base);
}
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
gen7_3DPRIMITIVE(cmd, p->prim_type, indexed, vertex_count,
vertex_start, instance_count, instance_start, vertex_base);
} else {
gen6_3DPRIMITIVE(cmd, p->prim_type, indexed, vertex_count,
vertex_start, instance_count, instance_start, vertex_base);
}
cmd->bind.draw_count++;
cmd->bind.render_pass_changed = false;
/* need to re-emit all workarounds */
cmd->bind.wa_flags = 0;
if (intel_debug & INTEL_DEBUG_NOCACHE)
cmd_batch_flush_all(cmd);
}
void cmd_draw_meta(struct intel_cmd *cmd, const struct intel_cmd_meta *meta)
{
cmd->bind.meta = meta;
cmd_adjust_state_base_address(cmd);
cmd_wa_gen6_pre_depth_stall_write(cmd);
cmd_wa_gen6_pre_command_scoreboard_stall(cmd);
gen6_meta_dynamic_states(cmd);
gen6_meta_surface_states(cmd);
if (cmd_gen(cmd) >= INTEL_GEN(7)) {
gen7_meta_urb(cmd);
gen6_meta_vf(cmd);
gen6_meta_vs(cmd);
gen7_meta_disabled(cmd);
gen6_meta_clip(cmd);
gen6_meta_wm(cmd);
gen7_meta_ps(cmd);
gen6_meta_depth_buffer(cmd);
cmd_wa_gen7_post_command_cs_stall(cmd);
cmd_wa_gen7_post_command_depth_stall(cmd);
if (meta->mode == INTEL_CMD_META_VS_POINTS) {
gen7_3DPRIMITIVE(cmd, GEN6_3DPRIM_POINTLIST, false,
meta->width * meta->height, 0, 1, 0, 0);
} else {
gen7_3DPRIMITIVE(cmd, GEN6_3DPRIM_RECTLIST, false, 3, 0, 1, 0, 0);
}
} else {
gen6_meta_urb(cmd);
gen6_meta_vf(cmd);
gen6_meta_vs(cmd);
gen6_meta_disabled(cmd);
gen6_meta_clip(cmd);
gen6_meta_wm(cmd);
gen6_meta_ps(cmd);
gen6_meta_depth_buffer(cmd);
if (meta->mode == INTEL_CMD_META_VS_POINTS) {
gen6_3DPRIMITIVE(cmd, GEN6_3DPRIM_POINTLIST, false,
meta->width * meta->height, 0, 1, 0, 0);
} else {
gen6_3DPRIMITIVE(cmd, GEN6_3DPRIM_RECTLIST, false, 3, 0, 1, 0, 0);
}
}
cmd->bind.draw_count++;
/* need to re-emit all workarounds */
cmd->bind.wa_flags = 0;
cmd->bind.meta = NULL;
/* make the normal path believe the render pass has changed */
cmd->bind.render_pass_changed = true;
if (intel_debug & INTEL_DEBUG_NOCACHE)
cmd_batch_flush_all(cmd);
}
static void cmd_exec(struct intel_cmd *cmd, struct intel_bo *bo)
{
const uint8_t cmd_len = 2;
uint32_t *dw;
uint32_t pos;
assert(cmd_gen(cmd) >= INTEL_GEN(7.5) && "Invalid GPU version");
pos = cmd_batch_pointer(cmd, cmd_len, &dw);
dw[0] = GEN6_MI_CMD(MI_BATCH_BUFFER_START) | (cmd_len - 2) |
GEN75_MI_BATCH_BUFFER_START_DW0_SECOND_LEVEL |
GEN75_MI_BATCH_BUFFER_START_DW0_NON_PRIVILEGED |
GEN6_MI_BATCH_BUFFER_START_DW0_USE_PPGTT;
cmd_batch_reloc(cmd, pos + 1, bo, 0, 0);
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdBindPipeline(
VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
VkPipeline pipeline)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE:
cmd_bind_compute_pipeline(cmd, intel_pipeline(pipeline));
break;
case VK_PIPELINE_BIND_POINT_GRAPHICS:
cmd_bind_graphics_pipeline(cmd, intel_pipeline(pipeline));
break;
default:
assert(!"unsupported pipelineBindPoint");
break;
}
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdBindDescriptorSets(
VkCommandBuffer commandBuffer,
VkPipelineBindPoint pipelineBindPoint,
VkPipelineLayout layout,
uint32_t firstSet,
uint32_t descriptorSetCount,
const VkDescriptorSet* pDescriptorSets,
uint32_t dynamicOffsetCount,
const uint32_t* pDynamicOffsets)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
const struct intel_pipeline_layout *pipeline_layout;
struct intel_cmd_dset_data *data = NULL;
uint32_t offset_count = 0;
uint32_t i;
pipeline_layout = intel_pipeline_layout(layout);
switch (pipelineBindPoint) {
case VK_PIPELINE_BIND_POINT_COMPUTE:
data = &cmd->bind.dset.compute_data;
break;
case VK_PIPELINE_BIND_POINT_GRAPHICS:
data = &cmd->bind.dset.graphics_data;
break;
default:
assert(!"unsupported pipelineBindPoint");
break;
}
cmd_alloc_dset_data(cmd, data, pipeline_layout);
for (i = 0; i < descriptorSetCount; i++) {
struct intel_desc_set *dset = intel_desc_set(pDescriptorSets[i]);
offset_count += pipeline_layout->layouts[firstSet + i]->dynamic_desc_count;
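/* consume pDynamicOffsets only when enough offsets remain for this set */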
if (offset_count <= dynamicOffsetCount) {
cmd_copy_dset_data(cmd, data, pipeline_layout, firstSet + i,
dset, pDynamicOffsets);
pDynamicOffsets += pipeline_layout->layouts[firstSet + i]->dynamic_desc_count;
}
}
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdBindVertexBuffers(
VkCommandBuffer commandBuffer,
uint32_t firstBinding,
uint32_t bindingCount,
const VkBuffer* pBuffers,
const VkDeviceSize* pOffsets)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
for (uint32_t i = 0; i < bindingCount; i++) {
struct intel_buf *buf = intel_buf(pBuffers[i]);
cmd_bind_vertex_data(cmd, buf, pOffsets[i], firstBinding + i);
}
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdBindIndexBuffer(
VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkDeviceSize offset,
VkIndexType indexType)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
struct intel_buf *buf = intel_buf(buffer);
cmd_bind_index_data(cmd, buf, offset, indexType);
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdDraw(
VkCommandBuffer commandBuffer,
uint32_t vertexCount,
uint32_t instanceCount,
uint32_t firstVertex,
uint32_t firstInstance)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
cmd_draw(cmd, firstVertex, vertexCount,
firstInstance, instanceCount, false, 0);
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexed(
VkCommandBuffer commandBuffer,
uint32_t indexCount,
uint32_t instanceCount,
uint32_t firstIndex,
int32_t vertexOffset,
uint32_t firstInstance)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
cmd_draw(cmd, firstIndex, indexCount,
firstInstance, instanceCount, true, vertexOffset);
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirect(
VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
assert(0 && "vkCmdDrawIndirect not implemented");
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirect(
VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkDeviceSize offset,
uint32_t drawCount,
uint32_t stride)
{
assert(0 && "vkCmdDrawIndexedIndirect not implemented");
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdDispatch(
VkCommandBuffer commandBuffer,
uint32_t x,
uint32_t y,
uint32_t z)
{
assert(0 && "vkCmdDispatch not implemented");
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect(
VkCommandBuffer commandBuffer,
VkBuffer buffer,
VkDeviceSize offset)
{
assert(0 && "vkCmdDisatchIndirect not implemented");
}
VKAPI_ATTR void VKAPI_CALL vkCmdPushConstants(
VkCommandBuffer commandBuffer,
VkPipelineLayout layout,
VkShaderStageFlags stageFlags,
uint32_t offset,
uint32_t size,
const void* pValues)
{
/* TODO: Implement */
}
VKAPI_ATTR void VKAPI_CALL vkGetRenderAreaGranularity(
VkDevice device,
VkRenderPass renderPass,
VkExtent2D* pGranularity)
{
pGranularity->height = 1;
pGranularity->width = 1;
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdBeginRenderPass(
VkCommandBuffer commandBuffer,
const VkRenderPassBeginInfo* pRenderPassBegin,
VkSubpassContents contents)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
const struct intel_render_pass *rp =
intel_render_pass(pRenderPassBegin->renderPass);
const struct intel_fb *fb = intel_fb(pRenderPassBegin->framebuffer);
const struct intel_att_view *view;
uint32_t i;
/* TODOVV: */
assert(!(!cmd->primary || rp->attachment_count != fb->view_count) && "Invalid RenderPass");
cmd_begin_render_pass(cmd, rp, fb, 0, contents);
for (i = 0; i < rp->attachment_count; i++) {
const struct intel_render_pass_attachment *att = &rp->attachments[i];
const VkClearValue *clear_val =
&pRenderPassBegin->pClearValues[i];
VkImageSubresourceRange range;
view = fb->views[i];
range.baseMipLevel = view->mipLevel;
range.levelCount = 1;
range.baseArrayLayer = view->baseArrayLayer;
range.layerCount = view->array_size;
range.aspectMask = 0;
if (view->is_rt) {
/* color */
if (att->clear_on_load) {
range.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
cmd_meta_clear_color_image(commandBuffer, view->img,
att->initial_layout, &clear_val->color, 1, &range);
}
} else {
/* depth/stencil */
if (att->clear_on_load) {
range.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
}
if (att->stencil_clear_on_load) {
range.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
}
if (range.aspectMask) {
cmd_meta_clear_depth_stencil_image(commandBuffer,
view->img, att->initial_layout,
clear_val->depthStencil.depth, clear_val->depthStencil.stencil,
1, &range);
}
}
}
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdNextSubpass(
VkCommandBuffer commandBuffer,
VkSubpassContents contents)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
const struct intel_render_pass U_ASSERT_ONLY *rp = cmd->bind.render_pass;
/* TODOVV */
assert(!(cmd->bind.render_pass_subpass >= rp->subpasses +
rp->subpass_count - 1) && "Invalid RenderPassContents");
cmd->bind.render_pass_changed = true;
cmd->bind.render_pass_subpass++;
cmd->bind.render_pass_contents = contents;
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdEndRenderPass(
VkCommandBuffer commandBuffer)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
cmd_end_render_pass(cmd);
}
ICD_EXPORT VKAPI_ATTR void VKAPI_CALL vkCmdExecuteCommands(
VkCommandBuffer commandBuffer,
uint32_t commandBuffersCount,
const VkCommandBuffer* pCommandBuffers)
{
struct intel_cmd *cmd = intel_cmd(commandBuffer);
uint32_t i;
/* TODOVV */
assert(!(!cmd->bind.render_pass || cmd->bind.render_pass_contents !=
VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS) && "Invalid RenderPass");
for (i = 0; i < commandBuffersCount; i++) {
const struct intel_cmd *secondary = intel_cmd(pCommandBuffers[i]);
/* TODOVV: Move test to validation layer */
assert(!(secondary->primary) && "Cannot be primary command buffer");
cmd_exec(cmd, intel_cmd_get_batch(secondary, NULL));
}
if (i)
cmd_batch_state_base_address(cmd);
}