intel: honor 64KB limit of BINDING_TABLE_STATEs

The higher 16 bits of binding table offset in 3DSTATE_BINDING_TABLE_POINTERS_x
must be 0.  BINDING_TABLE_STATEs must reside in the first 64KB of Surface
State Base Address as programmed by STATE_BASE_ADDRESS.

In this commit, we update Surface State Base Address before a draw or meta
operation whenever its worst-case surface writes could exceed the limit.
diff --git a/icd/intel/cmd.c b/icd/intel/cmd.c
index 47f8629..7b23145 100644
--- a/icd/intel/cmd.c
+++ b/icd/intel/cmd.c
@@ -55,6 +55,8 @@
 
     writer->used = 0;
 
+    writer->sba_offset = 0;
+
     if (writer->items) {
         icd_free(writer->items);
         writer->items = NULL;
diff --git a/icd/intel/cmd.h b/icd/intel/cmd.h
index dcb1965..16281fa 100644
--- a/icd/intel/cmd.h
+++ b/icd/intel/cmd.h
@@ -203,6 +203,8 @@
 
     size_t used;
 
+    uint32_t sba_offset;
+
     /* for decoding */
     struct intel_cmd_item *items;
     uint32_t item_alloc;
diff --git a/icd/intel/cmd_pipeline.c b/icd/intel/cmd_pipeline.c
index 55947ad..97baed8 100644
--- a/icd/intel/cmd_pipeline.c
+++ b/icd/intel/cmd_pipeline.c
@@ -1200,9 +1200,12 @@
     dw[9] = 1;
 
     cmd_reserve_reloc(cmd, 3);
-    cmd_batch_reloc_writer(cmd, pos + 2, INTEL_CMD_WRITER_SURFACE, 1);
-    cmd_batch_reloc_writer(cmd, pos + 3, INTEL_CMD_WRITER_STATE, 1);
-    cmd_batch_reloc_writer(cmd, pos + 5, INTEL_CMD_WRITER_INSTRUCTION, 1);
+    cmd_batch_reloc_writer(cmd, pos + 2, INTEL_CMD_WRITER_SURFACE,
+            cmd->writers[INTEL_CMD_WRITER_SURFACE].sba_offset + 1);
+    cmd_batch_reloc_writer(cmd, pos + 3, INTEL_CMD_WRITER_STATE,
+            cmd->writers[INTEL_CMD_WRITER_STATE].sba_offset + 1);
+    cmd_batch_reloc_writer(cmd, pos + 5, INTEL_CMD_WRITER_INSTRUCTION,
+            cmd->writers[INTEL_CMD_WRITER_INSTRUCTION].sba_offset + 1);
 }
 
 void cmd_batch_flush(struct intel_cmd *cmd, uint32_t pipe_control_dw0)
@@ -1542,6 +1545,8 @@
                                    const struct intel_pipeline_rmap *rmap,
                                    const XGL_PIPELINE_SHADER_STAGE stage)
 {
+    const uint32_t sba_offset =
+        cmd->writers[INTEL_CMD_WRITER_SURFACE].sba_offset;
     uint32_t binding_table[256], offset;
     uint32_t surface_count, i;
 
@@ -1627,12 +1632,17 @@
                     null_view.cmd_len, null_view.cmd);
         }
 
-        binding_table[i] = offset;
+        binding_table[i] = offset - sba_offset;
     }
 
-    return cmd_surface_write(cmd, INTEL_CMD_ITEM_BINDING_TABLE,
+    offset = cmd_surface_write(cmd, INTEL_CMD_ITEM_BINDING_TABLE,
             GEN6_ALIGNMENT_BINDING_TABLE_STATE,
-            surface_count, binding_table);
+            surface_count, binding_table) - sba_offset;
+
+    /* there is a 64KB limit on BINDING_TABLE_STATEs */
+    assert(offset + sizeof(uint32_t) * surface_count <= 64 * 1024);
+
+    return offset;
 }
 
 static void gen6_3DSTATE_VERTEX_BUFFERS(struct intel_cmd *cmd)
@@ -2982,6 +2992,68 @@
     cmd->bind.state.blend = state;
 }
 
+static uint32_t cmd_get_max_surface_write(const struct intel_cmd *cmd)
+{
+    const struct intel_pipeline *pipeline = cmd->bind.pipeline.graphics;
+    struct intel_pipeline_rmap *rmaps[5] = {
+        pipeline->vs.rmap,
+        pipeline->tcs.rmap,
+        pipeline->tes.rmap,
+        pipeline->gs.rmap,
+        pipeline->fs.rmap,
+    };
+    uint32_t max_write; /* upper bound in bytes, accumulated per stage below */
+    int i;
+
+    STATIC_ASSERT(GEN6_ALIGNMENT_SURFACE_STATE >= GEN6_SURFACE_STATE__SIZE);
+    STATIC_ASSERT(GEN6_ALIGNMENT_SURFACE_STATE >=
+            GEN6_ALIGNMENT_BINDING_TABLE_STATE);
+
+    /* pad first: presumably covers alignment padding before the first write */
+    max_write = GEN6_ALIGNMENT_SURFACE_STATE;
+
+    for (i = 0; i < ARRAY_SIZE(rmaps); i++) {
+        const struct intel_pipeline_rmap *rmap = rmaps[i];
+        const uint32_t surface_count = (rmap) ?
+            rmap->rt_count + rmap->texture_resource_count +
+            rmap->resource_count + rmap->uav_count : 0; /* no rmap: 0 surfaces */
+
+        if (surface_count) {
+            /* SURFACE_STATEs: each fits in one alignment unit (asserted above) */
+            max_write += GEN6_ALIGNMENT_SURFACE_STATE * surface_count;
+
+            /* BINDING_TABLE_STATE: one dword per surface, rounded up */
+            max_write += u_align(sizeof(uint32_t) * surface_count,
+                    GEN6_ALIGNMENT_SURFACE_STATE);
+        }
+    }
+
+    return max_write;
+}
+
+static void cmd_adjust_state_base_address(struct intel_cmd *cmd)
+{
+    struct intel_cmd_writer *writer = &cmd->writers[INTEL_CMD_WRITER_SURFACE];
+    const uint32_t cur_surface_offset = writer->used - writer->sba_offset; /* bytes used past the current base */
+    uint32_t max_surface_write;
+
+    /* meta: fixed budget for src and dst SURFACE_STATEs plus BINDING_TABLE_STATE */
+    if (cmd->bind.meta)
+        max_surface_write = 64 * sizeof(uint32_t);
+    else
+        max_surface_write = cmd_get_max_surface_write(cmd);
+
+    /* 64KB limit on BINDING_TABLE_STATEs: re-base if the worst case exceeds it */
+    if (cur_surface_offset + max_surface_write > 64 * 1024) {
+        /* SBA expects page-aligned addresses, so round used down to 4KB */
+        writer->sba_offset = writer->used & ~0xfff;
+
+        assert((writer->used & 0xfff) + max_surface_write <= 64 * 1024);
+
+        cmd_batch_state_base_address(cmd); /* presumably re-emits STATE_BASE_ADDRESS with the new sba_offset */
+    }
+}
+
 static void cmd_draw(struct intel_cmd *cmd,
                      uint32_t vertex_start,
                      uint32_t vertex_count,
@@ -2991,9 +3063,17 @@
                      uint32_t vertex_base)
 {
     const struct intel_pipeline *p = cmd->bind.pipeline.graphics;
+    const uint32_t surface_writer_used =
+        cmd->writers[INTEL_CMD_WRITER_SURFACE].used;
+
+    cmd_adjust_state_base_address(cmd);
 
     emit_bounded_states(cmd);
 
+    /* sanity check on cmd_get_max_surface_write() */
+    assert(cmd->writers[INTEL_CMD_WRITER_SURFACE].used -
+            surface_writer_used <= cmd_get_max_surface_write(cmd));
+
     if (indexed) {
         if (p->primitive_restart && !gen6_can_primitive_restart(cmd))
             cmd->result = XGL_ERROR_UNKNOWN;
@@ -3033,6 +3113,8 @@
 {
     cmd->bind.meta = meta;
 
+    cmd_adjust_state_base_address(cmd);
+
     cmd_wa_gen6_pre_depth_stall_write(cmd);
     cmd_wa_gen6_pre_command_scoreboard_stall(cmd);
 
diff --git a/icd/intel/cmd_priv.h b/icd/intel/cmd_priv.h
index 9349b10..bb21fcd 100644
--- a/icd/intel/cmd_priv.h
+++ b/icd/intel/cmd_priv.h
@@ -287,7 +287,7 @@
                                          const uint32_t *dw)
 {
     const enum intel_cmd_writer_type which = INTEL_CMD_WRITER_SURFACE;
-    const XGL_SIZE size = len << 2;
+    const size_t size = len << 2;
     const uint32_t offset = cmd_writer_reserve(cmd, which, alignment, size);
     struct intel_cmd_writer *writer = &cmd->writers[which];
     uint32_t *dst;