compiler: Start using SIMD16 fragment shaders. The SIMD16 ISA has always been available; we just hadn't hooked it up in the driver yet. We now have enough workloads to tell whether it helps performance, so flip the switch!

commit: e86574eeee5abde8705755afcd4e52b7daaf88ed [log] [tgz]
author: Cody Northrop <cody@lunarg.com> Tue Feb 24 14:15:29 2015 -0700
committer: Cody Northrop <cody@lunarg.com> Tue Feb 24 16:05:57 2015 -0700
tree: de1f538b5ca12811c1addd2673d6a725959016ae
parent: 6e68a893f0364c1157789d260cdf84365cb58cf7 [diff] [blame]
diff --git a/icd/intel/cmd_pipeline.c b/icd/intel/cmd_pipeline.c
index 1d6093c..6a5e6a4 100644
--- a/icd/intel/cmd_pipeline.c
+++ b/icd/intel/cmd_pipeline.c

@@ -556,7 +556,7 @@
     const struct intel_pipeline_shader *fs = &pipeline->fs;
     const uint8_t cmd_len = 9;
     uint32_t pos;
-    uint32_t dw0, dw2, dw4, dw5, dw6, *dw;
+    uint32_t dw0, dw2, dw4, dw5, dw6, dw8, *dw;
 
     CMD_ASSERT(cmd, 6, 6);
 
@@ -568,12 +568,15 @@
     dw4 = GEN6_WM_DW4_STATISTICS |
           fs->urb_grf_start << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
           0 << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
-          0 << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
+          fs->urb_grf_start_16 << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
 
     dw5 = (fs->max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT |
           GEN6_WM_DW5_PS_ENABLE |
           GEN6_WM_DW5_8_PIXEL_DISPATCH;
 
+    if (fs->offset_16)
+        dw5 |= GEN6_WM_DW5_16_PIXEL_DISPATCH;
+
     if (fs->uses & INTEL_SHADER_USE_KILL ||
         pipeline->cb_state.alphaToCoverageEnable)
         dw5 |= GEN6_WM_DW5_PS_KILL;
@@ -602,6 +605,8 @@
                GEN6_WM_DW6_MSDISPMODE_PERSAMPLE;
     }
 
+    dw8 = (fs->offset_16) ? cmd->bind.pipeline.fs_offset + fs->offset_16 : 0;
+
     pos = cmd_batch_pointer(cmd, cmd_len, &dw);
     dw[0] = dw0;
     dw[1] = cmd->bind.pipeline.fs_offset;
@@ -611,7 +616,7 @@
     dw[5] = dw5;
     dw[6] = dw6;
     dw[7] = 0; /* kernel 1 */
-    dw[8] = 0; /* kernel 2 */
+    dw[8] = dw8; /* kernel 2 */
 
     if (fs->per_thread_scratch_size)
         gen6_add_scratch_space(cmd, pos + 3, pipeline, fs);
@@ -666,7 +671,7 @@
     const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
     const struct intel_pipeline_shader *fs = &pipeline->fs;
     const uint8_t cmd_len = 8;
-    uint32_t dw0, dw2, dw4, dw5, *dw;
+    uint32_t dw0, dw2, dw4, dw5, dw7, *dw;
     uint32_t pos;
 
     CMD_ASSERT(cmd, 7, 7.5);
@@ -679,6 +684,9 @@
     dw4 = GEN7_PS_DW4_POSOFFSET_NONE |
           GEN7_PS_DW4_8_PIXEL_DISPATCH;
 
+    if (fs->offset_16)
+        dw4 |= GEN7_PS_DW4_16_PIXEL_DISPATCH;
+
     if (cmd_gen(cmd) >= INTEL_GEN(7.5)) {
         dw4 |= (fs->max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
         dw4 |= pipeline->cmd_sample_mask << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
@@ -694,7 +702,9 @@
 
     dw5 = fs->urb_grf_start << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
           0 << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
-          0 << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
+          fs->urb_grf_start_16 << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
+
+    dw7 = (fs->offset_16) ? cmd->bind.pipeline.fs_offset + fs->offset_16 : 0;
 
     pos = cmd_batch_pointer(cmd, cmd_len, &dw);
     dw[0] = dw0;
@@ -704,7 +714,7 @@
     dw[4] = dw4;
     dw[5] = dw5;
     dw[6] = 0; /* kernel 1 */
-    dw[7] = 0; /* kernel 2 */
+    dw[7] = dw7; /* kernel 2 */
 
     if (fs->per_thread_scratch_size)
         gen6_add_scratch_space(cmd, pos + 3, pipeline, fs);
commit	e86574eeee5abde8705755afcd4e52b7daaf88ed	[log] [tgz]
author	Cody Northrop <cody@lunarg.com>	Tue Feb 24 14:15:29 2015 -0700
committer	Cody Northrop <cody@lunarg.com>	Tue Feb 24 16:05:57 2015 -0700
tree	de1f538b5ca12811c1addd2673d6a725959016ae
parent	6e68a893f0364c1157789d260cdf84365cb58cf7 [diff] [blame]