intel: emit 3DSTATE_SF and 3DSTATE_SBE

It looks like 3DSTATE_SBE could be moved to the PSO, since it is built only
from the bound pipeline's VS and FS.
diff --git a/icd/intel/cmd_pipeline.c b/icd/intel/cmd_pipeline.c
index fe07f30..6460f94 100644
--- a/icd/intel/cmd_pipeline.c
+++ b/icd/intel/cmd_pipeline.c
@@ -353,6 +353,183 @@
     cmd_batch_write(cmd, 0);
 }
 
+/*
+ * Fill the six SF body dwords used by the Gen6 and Gen7 3DSTATE_SF emitters:
+ * body[0..2] come from the bound pipeline, viewport, raster and MSAA state,
+ * body[3..5] are the raster state's depth offset const, scale and clamp.
+ */
+static void gen7_fill_3DSTATE_SF_body(const struct intel_cmd *cmd,
+                                      uint32_t body[6])
+{
+    const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
+    const struct intel_viewport_state *viewport = cmd->bind.state.viewport;
+    const struct intel_raster_state *raster = cmd->bind.state.raster;
+    const struct intel_msaa_state *msaa = cmd->bind.state.msaa;
+    int point_width;
+
+    CMD_ASSERT(cmd, 6, 7.5);
+
+    body[0] = GEN7_SF_DW1_STATISTICS |
+              GEN7_SF_DW1_DEPTH_OFFSET_SOLID |
+              GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME |
+              GEN7_SF_DW1_DEPTH_OFFSET_POINT |
+              GEN7_SF_DW1_VIEWPORT_ENABLE |
+              raster->cmd_sf_fill;
+
+    /* only Gen7+ gets the depth buffer format ORed into the first dword */
+    if (cmd_gen(cmd) >= INTEL_GEN(7)) {
+        int depth_format = 0;
+
+        switch (pipeline->db_format.channelFormat) {
+        case XGL_CH_FMT_R16:
+            depth_format = GEN6_ZFORMAT_D16_UNORM;
+            break;
+        case XGL_CH_FMT_R32:
+        case XGL_CH_FMT_R32G8:
+            depth_format = GEN6_ZFORMAT_D32_FLOAT;
+            break;
+        default:
+            assert(!"unknown depth format");
+            break;
+        }
+
+        body[0] |= depth_format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT;
+    }
+
+    /* start from the cull bits; line width is 128 with MSAA, 0 without */
+    body[1] = raster->cmd_sf_cull;
+    if (msaa->sample_count <= 1) {
+        body[1] |= GEN7_SF_DW2_MSRASTMODE_OFF_PIXEL;
+    } else {
+        body[1] |= 128 << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
+                   GEN7_SF_DW2_MSRASTMODE_ON_PATTERN;
+    }
+
+    if (viewport->scissor_enable)
+        body[1] |= GEN7_SF_DW2_SCISSOR_ENABLE;
+
+    /* point size in U8.3: scale by 8, add 0.5, truncate, clamp to [1, 2047] */
+    point_width = (int) (pipeline->pointSize * 8.0f + 0.5f);
+    point_width = U_CLAMP(point_width, 1, 2047);
+
+    body[2] = pipeline->provoking_vertex_tri << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+              pipeline->provoking_vertex_line << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+              pipeline->provoking_vertex_trifan << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT |
+              GEN7_SF_DW3_SUBPIXEL_8BITS |
+              GEN7_SF_DW3_USE_POINT_WIDTH |
+              point_width;
+
+    body[3] = raster->cmd_depth_offset_const;
+    body[4] = raster->cmd_depth_offset_scale;
+    body[5] = raster->cmd_depth_offset_clamp;
+}
+
+/*
+ * Fill the 13 body dwords that follow the header of Gen7 3DSTATE_SBE (the
+ * same values are also written as part of Gen6 3DSTATE_SF by its emitter).
+ * cmd supplies the bound graphics pipeline, whose VS and FS are inspected.
+ *
+ *   body[0]     attribute count, URB read length and URB read offset
+ *   body[1..8]  attribute entries, two per dword, mapped one-to-one
+ *   body[9..12] zero
+ */
+static void gen7_fill_3DSTATE_SBE_body(const struct intel_cmd *cmd,
+                                       uint32_t body[13])
+{
+    const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
+    const struct intel_shader *vs = intel_shader(pipeline->vs.shader);
+    const struct intel_shader *fs = intel_shader(pipeline->fs.shader);
+    const XGL_UINT attr_skip = 2;
+    XGL_UINT attr_count, vue_offset, vue_len;
+    XGL_UINT slot;
+
+    CMD_ASSERT(cmd, 6, 7.5);
+
+    /* the VS also outputs the VUE header and position, which are skipped */
+    assert(vs->out_count >= 2);
+    attr_count = vs->out_count - attr_skip;
+    assert(fs->in_count == attr_count);
+    assert(fs->in_count <= 32);
+
+    /* both values are halved attribute counts; the length is at least 1 */
+    vue_offset = attr_skip / 2;
+    vue_len = (attr_count + 1) / 2;
+    if (vue_len == 0) {
+        vue_len = 1;
+    }
+
+    body[0] = fs->in_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT |
+              vue_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT |
+              vue_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT;
+
+    /* no attr swizzles: entry N selects attribute N, unused entries are 0 */
+    for (slot = 0; slot < 16; slot += 2) {
+        uint16_t lo = 0, hi = 0;
+
+        if (slot < fs->in_count)
+            lo = slot;
+        if (slot + 1 < fs->in_count)
+            hi = slot + 1;
+
+        body[1 + slot / 2] = hi << GEN7_SBE_ATTR_HIGH__SHIFT | lo;
+    }
+
+    body[9] = 0;  /* point sprite enables */
+    body[10] = 0; /* constant interpolation enables */
+    body[11] = 0; /* WrapShortest enables */
+    body[12] = 0;
+}
+
+/*
+ * Gen6 3DSTATE_SF (20 dwords): header, SBE body[0], SF body, SBE body[1..12].
+ */
+static void gen6_3DSTATE_SF(struct intel_cmd *cmd)
+{
+    const uint8_t cmd_len = 20;
+    const uint32_t header = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
+    uint32_t sf_body[6], sbe_body[13];
+
+    CMD_ASSERT(cmd, 6, 6);
+
+    gen7_fill_3DSTATE_SF_body(cmd, sf_body);
+    gen7_fill_3DSTATE_SBE_body(cmd, sbe_body);
+    cmd_batch_reserve(cmd, cmd_len);
+    cmd_batch_write(cmd, header);
+    cmd_batch_write(cmd, sbe_body[0]);
+    cmd_batch_write_n(cmd, sf_body, 6);
+    cmd_batch_write_n(cmd, sbe_body + 1, 12);
+}
+
+/* Emit the 7-dword Gen7 3DSTATE_SF: a header dword plus the six SF body dwords. */
+static void gen7_3DSTATE_SF(struct intel_cmd *cmd)
+{
+    const uint8_t cmd_len = 7;
+    uint32_t packet[7];
+
+    CMD_ASSERT(cmd, 7, 7.5);
+
+    /* header dword: opcode ORed with (cmd_len - 2) */
+    packet[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
+    gen7_fill_3DSTATE_SF_body(cmd, packet + 1);
+    cmd_batch_reserve(cmd, cmd_len);
+    cmd_batch_write_n(cmd, packet, cmd_len);
+}
+
+/* Emit the 14-dword Gen7 3DSTATE_SBE: a header dword plus the 13 SBE body dwords. */
+static void gen7_3DSTATE_SBE(struct intel_cmd *cmd)
+{
+    const uint8_t cmd_len = 14;
+    uint32_t packet[14];
+
+    CMD_ASSERT(cmd, 7, 7.5);
+
+    /* header dword: opcode ORed with (cmd_len - 2) */
+    packet[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
+    gen7_fill_3DSTATE_SBE_body(cmd, packet + 1);
+    cmd_batch_reserve(cmd, cmd_len);
+    cmd_batch_write_n(cmd, packet, cmd_len);
+}
+
 static void gen6_3DSTATE_WM(struct intel_cmd *cmd)
 {
     const int max_threads = (cmd->dev->gpu->gt == 2) ? 80 : 40;
@@ -1284,6 +1461,8 @@
         gen7_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_PS,
                 &cmd->bind.pipeline.graphics->fs);
 
+        gen7_3DSTATE_SF(cmd);
+        gen7_3DSTATE_SBE(cmd);
         gen7_3DSTATE_WM(cmd);
         gen7_3DSTATE_PS(cmd);
     } else {
@@ -1295,6 +1474,7 @@
         gen6_pcb(cmd, GEN6_RENDER_OPCODE_3DSTATE_CONSTANT_PS,
                 &cmd->bind.pipeline.graphics->fs);
 
+        gen6_3DSTATE_SF(cmd);
         gen6_3DSTATE_WM(cmd);
     }