radeonsi: implement multi_draw, but support only 1 draw

This only adapts the driver to the new multi_draw interface; draw packets are still emitted for draws[0] only.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7056>
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index ab236e2..d58f7b0 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -955,7 +955,8 @@
 
 enum si_prim_discard_outcome
 si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
-                                      bool primitive_restart)
+                                      const struct pipe_draw_start_count *draws,
+                                      unsigned num_draws, bool primitive_restart)
 {
    /* If the compute shader compilation isn't finished, this returns false. */
    if (!si_shader_select_prim_discard_cs(sctx, info, primitive_restart))
@@ -966,7 +967,7 @@
 
    struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
    unsigned prim = info->mode;
-   unsigned count = info->count;
+   unsigned count = draws[0].count;
    unsigned instance_count = info->instance_count;
    unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count);
    unsigned num_prims = num_prims_per_instance * instance_count;
@@ -982,19 +983,21 @@
        (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))) {
       /* Split draws. */
       struct pipe_draw_info split_draw = *info;
+      struct pipe_draw_start_count split_draw_range = draws[0];
+
       split_draw.primitive_restart = primitive_restart;
 
-      unsigned base_start = split_draw.start;
+      unsigned base_start = split_draw_range.start;
 
       if (prim == PIPE_PRIM_TRIANGLES) {
          unsigned vert_count_per_subdraw = split_prims_draw_level * 3;
          assert(vert_count_per_subdraw < count);
 
          for (unsigned start = 0; start < count; start += vert_count_per_subdraw) {
-            split_draw.start = base_start + start;
-            split_draw.count = MIN2(count - start, vert_count_per_subdraw);
+            split_draw_range.start = base_start + start;
+            split_draw_range.count = MIN2(count - start, vert_count_per_subdraw);
 
-            sctx->b.draw_vbo(&sctx->b, &split_draw);
+            sctx->b.multi_draw(&sctx->b, &split_draw, &split_draw_range, 1);
          }
       } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) {
          /* No primitive pair can be split, because strips reverse orientation
@@ -1004,10 +1007,10 @@
          unsigned vert_count_per_subdraw = split_prims_draw_level;
 
          for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) {
-            split_draw.start = base_start + start;
-            split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2);
+            split_draw_range.start = base_start + start;
+            split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2);
 
-            sctx->b.draw_vbo(&sctx->b, &split_draw);
+            sctx->b.multi_draw(&sctx->b, &split_draw, &split_draw_range, 1);
 
             if (start == 0 && primitive_restart &&
                 sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation)
@@ -1098,13 +1101,14 @@
 
 /* Dispatch a primitive discard compute shader. */
 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
-                                          const struct pipe_draw_info *info, unsigned index_size,
+                                          const struct pipe_draw_info *info,
+                                          unsigned count, unsigned index_size,
                                           unsigned base_vertex, uint64_t input_indexbuf_va,
                                           unsigned input_indexbuf_num_elements)
 {
    struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs;
    struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs;
-   unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count);
+   unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, count);
    if (!num_prims_per_instance)
       return;
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 68dbadc..ce906f5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1456,10 +1456,12 @@
 void si_build_prim_discard_compute_shader(struct si_shader_context *ctx);
 enum si_prim_discard_outcome
 si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info,
-                                      bool primitive_restart);
+                                      const struct pipe_draw_start_count *draws,
+                                      unsigned num_draws, bool primitive_restart);
 void si_compute_signal_gfx(struct si_context *sctx);
 void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx,
-                                          const struct pipe_draw_info *info, unsigned index_size,
+                                          const struct pipe_draw_info *info,
+                                          unsigned count, unsigned index_size,
                                           unsigned base_vertex, uint64_t input_indexbuf_va,
                                           unsigned input_indexbuf_max_elements);
 void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 2ee204b..95945b7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -760,6 +760,8 @@
 }
 
 static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
+                                 const struct pipe_draw_start_count *draws,
+                                 unsigned num_draws,
                                  struct pipe_resource *indexbuf, unsigned index_size,
                                  unsigned index_offset, unsigned instance_count,
                                  bool dispatch_prim_discard_cs, unsigned original_index_size)
@@ -838,6 +840,7 @@
    }
 
    if (indirect) {
+      assert(num_draws == 1);
       uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
 
       assert(indirect_va % 8 == 0);
@@ -912,7 +915,7 @@
       }
 
       /* Base vertex and start instance. */
-      base_vertex = original_index_size ? info->index_bias : info->start;
+      base_vertex = original_index_size ? info->index_bias : draws[0].start;
 
       if (sctx->num_vs_blit_sgprs) {
          /* Re-emit draw constants after we leave u_blitter. */
@@ -938,25 +941,26 @@
 
       if (index_size) {
          if (dispatch_prim_discard_cs) {
-            index_va += info->start * original_index_size;
-            index_max_size = MIN2(index_max_size, info->count);
+            index_va += draws[0].start * original_index_size;
+            index_max_size = MIN2(index_max_size, draws[0].count);
 
-            si_dispatch_prim_discard_cs_and_draw(sctx, info, original_index_size, base_vertex,
+            si_dispatch_prim_discard_cs_and_draw(sctx, info, draws[0].count,
+                                                 original_index_size, base_vertex,
                                                  index_va, index_max_size);
             return;
          }
 
-         index_va += info->start * index_size;
+         index_va += draws[0].start * index_size;
 
          radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
          radeon_emit(cs, index_max_size);
          radeon_emit(cs, index_va);
          radeon_emit(cs, index_va >> 32);
-         radeon_emit(cs, info->count);
+         radeon_emit(cs, draws[0].count);
          radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
       } else {
          radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
-         radeon_emit(cs, info->count);
+         radeon_emit(cs, draws[0].count);
          radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
                             S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
       }
@@ -1537,7 +1541,8 @@
 }
 
 static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_draw_info *info,
-                                    unsigned *start, unsigned *count)
+                                    const struct pipe_draw_start_count *draws,
+                                    unsigned num_draws, unsigned *start, unsigned *count)
 {
    struct pipe_draw_indirect_info *indirect = info->indirect;
 
@@ -1593,8 +1598,16 @@
          *start = *count = 0;
       }
    } else {
-      *start = info->start;
-      *count = info->count;
+      unsigned min_element = UINT_MAX;
+      unsigned max_element = 0;
+
+      for (unsigned i = 0; i < num_draws; i++) {
+         min_element = MIN2(min_element, draws[i].start);
+         max_element = MAX2(max_element, draws[i].start + draws[i].count);
+      }
+
+      *start = min_element;
+      *count = max_element;
    }
 }
 
@@ -1724,7 +1737,10 @@
    return false;
 }
 
-static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
+static void si_multi_draw_vbo(struct pipe_context *ctx,
+                              const struct pipe_draw_info *info,
+                              const struct pipe_draw_start_count *draws,
+                              unsigned num_draws)
 {
    struct si_context *sctx = (struct si_context *)ctx;
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
@@ -1732,7 +1748,7 @@
    unsigned dirty_tex_counter, dirty_buf_counter;
    enum pipe_prim_type rast_prim, prim = info->mode;
    unsigned index_size = info->index_size;
-   unsigned index_offset = info->indirect ? info->start * index_size : 0;
+   unsigned index_offset = info->indirect ? draws[0].start * index_size : 0;
    unsigned instance_count = info->instance_count;
    bool primitive_restart =
       info->primitive_restart &&
@@ -1841,7 +1857,7 @@
          unsigned start, count, start_offset, size, offset;
          void *ptr;
 
-         si_get_draw_start_count(sctx, info, &start, &count);
+         si_get_draw_start_count(sctx, info, draws, num_draws, &start, &count);
          start_offset = start * 2;
          size = count * 2;
 
@@ -1860,10 +1876,11 @@
          unsigned start_offset;
 
          assert(!info->indirect);
-         start_offset = info->start * index_size;
+         assert(num_draws == 1);
+         start_offset = draws[0].start * index_size;
 
          indexbuf = NULL;
-         u_upload_data(ctx->stream_uploader, start_offset, info->count * index_size,
+         u_upload_data(ctx->stream_uploader, start_offset, draws[0].count * index_size,
                        sctx->screen->info.tcc_cache_line_size,
                        (char *)info->index.user + start_offset, &index_offset, &indexbuf);
          if (unlikely(!indexbuf))
@@ -1882,7 +1899,9 @@
    bool dispatch_prim_discard_cs = false;
    bool prim_discard_cs_instancing = false;
    unsigned original_index_size = index_size;
-   unsigned direct_count = 0;
+   unsigned avg_direct_count = 0;
+   unsigned min_direct_count = 0;
+   unsigned total_direct_count = 0;
 
    if (info->indirect) {
       struct pipe_draw_indirect_info *indirect = info->indirect;
@@ -1904,17 +1923,21 @@
          }
       }
    } else {
-      /* Multiply by 3 for strips and fans to get an approximate vertex
-       * count as triangles. */
-      direct_count = info->count * instance_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
+      for (unsigned i = 0; i < num_draws; i++) {
+         unsigned count = draws[i].count;
+
+         total_direct_count += count;
+         min_direct_count = MIN2(min_direct_count, count);
+      }
+      avg_direct_count = (total_direct_count / num_draws) * instance_count;
    }
 
    /* Determine if we can use the primitive discard compute shader. */
    if (si_compute_prim_discard_enabled(sctx) &&
-       (direct_count > sctx->prim_discard_vertex_count_threshold
-           ? (sctx->compute_num_verts_rejected += direct_count, true)
+       (avg_direct_count > sctx->prim_discard_vertex_count_threshold
+           ? (sctx->compute_num_verts_rejected += total_direct_count, true)
            : /* Add, then return true. */
-           (sctx->compute_num_verts_ineligible += direct_count,
+           (sctx->compute_num_verts_ineligible += total_direct_count,
             false)) && /* Add, then return false. */
        (!info->count_from_stream_output || pd_msg("draw_opaque")) &&
        (primitive_restart ?
@@ -1958,7 +1981,8 @@
         * dispatches can run ahead. */
        (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) ||
         pd_msg("write reference"))) {
-      switch (si_prepare_prim_discard_or_split_draw(sctx, info, primitive_restart)) {
+      switch (si_prepare_prim_discard_or_split_draw(sctx, info, draws, num_draws,
+                                                    primitive_restart)) {
       case SI_PRIM_DISCARD_ENABLED:
          original_index_size = index_size;
          prim_discard_cs_instancing = instance_count > 1;
@@ -1969,13 +1993,13 @@
          index_size = 4;
          instance_count = 1;
          primitive_restart = false;
-         sctx->compute_num_verts_rejected -= direct_count;
-         sctx->compute_num_verts_accepted += direct_count;
+         sctx->compute_num_verts_rejected -= total_direct_count;
+         sctx->compute_num_verts_accepted += total_direct_count;
          break;
       case SI_PRIM_DISCARD_DISABLED:
          break;
       case SI_PRIM_DISCARD_DRAW_SPLIT:
-         sctx->compute_num_verts_rejected -= direct_count;
+         sctx->compute_num_verts_rejected -= total_direct_count;
          goto return_cleanup;
       }
    }
@@ -1989,9 +2013,9 @@
    struct si_shader_selector *hw_vs;
    if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
        (hw_vs = si_get_vs(sctx)->cso) &&
-       (direct_count > hw_vs->ngg_cull_vert_threshold ||
+       (avg_direct_count > hw_vs->ngg_cull_vert_threshold ||
         (!index_size &&
-         direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
+         avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
          prim & ((1 << PIPE_PRIM_TRIANGLES) |
                  (1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
       unsigned ngg_culling = 0;
@@ -2015,7 +2039,7 @@
       /* Use NGG fast launch for certain non-indexed primitive types.
        * A draw must have at least 1 full primitive.
        */
-      if (ngg_culling && !index_size && direct_count >= 3 && !sctx->tes_shader.cso &&
+      if (ngg_culling && !index_size && min_direct_count >= 3 && !sctx->tes_shader.cso &&
           !sctx->gs_shader.cso) {
          if (prim == PIPE_PRIM_TRIANGLES)
             ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
@@ -2048,7 +2072,7 @@
    if (unlikely(sctx->do_update_shaders && !si_update_shaders(sctx)))
       goto return_cleanup;
 
-   si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, num_draws);
 
    /* If we're using a secure context, determine if cs must be secure or not */
    if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
@@ -2101,7 +2125,7 @@
          masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
 
       /* Emit all states except possibly render condition. */
-      si_emit_all_states(sctx, info, prim, instance_count, info->count,
+      si_emit_all_states(sctx, info, prim, instance_count, min_direct_count,
                          primitive_restart, masked_atoms);
       sctx->emit_cache_flush(sctx);
       /* <-- CUs are idle here. */
@@ -2118,7 +2142,8 @@
       }
       assert(sctx->dirty_atoms == 0);
 
-      si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count,
+      si_emit_draw_packets(sctx, info, draws, num_draws,
+                           indexbuf, index_size, index_offset, instance_count,
                            dispatch_prim_discard_cs, original_index_size);
       /* <-- CUs are busy here. */
 
@@ -2138,7 +2163,7 @@
       if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
          cik_emit_prefetch_L2(sctx, true);
 
-      si_emit_all_states(sctx, info, prim, instance_count, info->count,
+      si_emit_all_states(sctx, info, prim, instance_count, min_direct_count,
                          primitive_restart, masked_atoms);
 
       if (gfx9_scissor_bug &&
@@ -2148,7 +2173,8 @@
       }
       assert(sctx->dirty_atoms == 0);
 
-      si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, instance_count,
+      si_emit_draw_packets(sctx, info, draws, num_draws,
+                           indexbuf, index_size, index_offset, instance_count,
                            dispatch_prim_discard_cs, original_index_size);
 
       /* Prefetch the remaining shaders after the draw has been
@@ -2189,6 +2215,14 @@
       pipe_resource_reference(&indexbuf, NULL);
 }
 
+/* Single-draw entry point, implemented on top of si_multi_draw_vbo(). */
+static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
+{
+   /* Describe info's start/count as a one-element draw list and forward it. */
+   const struct pipe_draw_start_count single = {.start = info->start, .count = info->count};
+   si_multi_draw_vbo(ctx, info, &single, 1);
+}
+
 static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso,
                               blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2,
                               float depth, unsigned num_instances, enum blitter_attrib_type type,
@@ -2245,6 +2279,7 @@
 void si_init_draw_functions(struct si_context *sctx)
 {
    sctx->b.draw_vbo = si_draw_vbo;
+   sctx->b.multi_draw = si_multi_draw_vbo;
 
    sctx->blitter->draw_rectangle = si_draw_rectangle;