radeonsi: always use Wave32 for GS fast launch, because Wave64 hangs

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5524>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f56762a..9ad14ca 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1880,12 +1880,14 @@
 
 static inline unsigned si_get_wave_size(struct si_screen *sscreen,
                                         enum pipe_shader_type shader_type, bool ngg, bool es,
-                                        bool prim_discard_cs)
+                                        bool gs_fast_launch, bool prim_discard_cs)
 {
    if (shader_type == PIPE_SHADER_COMPUTE)
       return sscreen->compute_wave_size;
    else if (shader_type == PIPE_SHADER_FRAGMENT)
       return sscreen->ps_wave_size;
+   else if (gs_fast_launch)
+      return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
    else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
             (shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
             (shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
@@ -1898,7 +1900,9 @@
 static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
 {
    return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg,
-                           shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs);
+                           shader->key.as_es,
+                           shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
+                           shader->key.opt.vs_as_prim_discard_cs);
 }
 
 #define PRINT_ERR(fmt, args...)                                                                    \
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cec837d..60ff838 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1967,6 +1967,9 @@
       shader.key.as_ls = key->vs_prolog.as_ls;
       shader.key.as_es = key->vs_prolog.as_es;
       shader.key.as_ngg = key->vs_prolog.as_ngg;
+      shader.key.opt.ngg_culling =
+         (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
+         (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
       shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
       break;
    case PIPE_SHADER_TESS_CTRL:
@@ -1990,6 +1993,7 @@
    struct si_shader_context ctx;
    si_llvm_context_init(&ctx, sscreen, compiler,
                         si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es,
+                                         shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
                                          shader.key.opt.vs_as_prim_discard_cs));
    ctx.shader = &shader;
    ctx.type = type;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 2a60957..fc14b64 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -474,7 +474,8 @@
    shader->is_gs_copy_shader = true;
 
    si_llvm_context_init(&ctx, sscreen, compiler,
-                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
+                        si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
+                                         false, false, false, false));
    ctx.shader = shader;
    ctx.type = PIPE_SHADER_VERTEX;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5945a47..b4e95b7 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -69,7 +69,7 @@
       shader_variant_flags |= 1 << 0;
    if (sel->nir)
       shader_variant_flags |= 1 << 1;
-   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
+   if (si_get_wave_size(sel->screen, sel->type, ngg, es, false, false) == 32)
       shader_variant_flags |= 1 << 2;
    if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill &&
        sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
@@ -1120,11 +1120,13 @@
    else
       gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
 
+   unsigned wave_size = si_get_shader_wave_size(shader);
+
    si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
    si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
    si_pm4_set_reg(
       pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
-      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+      S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
          S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
          S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
          S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
@@ -3692,7 +3694,9 @@
    if (screen->info.chip_class >= GFX9)
       stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
 
-   if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+   if (screen->info.chip_class >= GFX10 &&
+       /* GS fast launch hangs with Wave64, so always use Wave32. */
+       (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
       stages |= S_028B54_HS_W32_EN(1) |
                 S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
                 S_028B54_VS_W32_EN(1);