radeonsi: always use Wave32 for GS fast launch, because Wave64 hangs
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5524>
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f56762a..9ad14ca 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1880,12 +1880,14 @@
static inline unsigned si_get_wave_size(struct si_screen *sscreen,
enum pipe_shader_type shader_type, bool ngg, bool es,
- bool prim_discard_cs)
+ bool gs_fast_launch, bool prim_discard_cs)
{
if (shader_type == PIPE_SHADER_COMPUTE)
return sscreen->compute_wave_size;
else if (shader_type == PIPE_SHADER_FRAGMENT)
return sscreen->ps_wave_size;
+ else if (gs_fast_launch)
+ return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */
else if ((shader_type == PIPE_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */
(shader_type == PIPE_SHADER_VERTEX && es && !ngg) ||
(shader_type == PIPE_SHADER_TESS_EVAL && es && !ngg) ||
@@ -1898,7 +1900,9 @@
static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
{
return si_get_wave_size(shader->selector->screen, shader->selector->type, shader->key.as_ngg,
- shader->key.as_es, shader->key.opt.vs_as_prim_discard_cs);
+ shader->key.as_es,
+ shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
+ shader->key.opt.vs_as_prim_discard_cs);
}
#define PRINT_ERR(fmt, args...) \
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cec837d..60ff838 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1967,6 +1967,9 @@
shader.key.as_ls = key->vs_prolog.as_ls;
shader.key.as_es = key->vs_prolog.as_es;
shader.key.as_ngg = key->vs_prolog.as_ngg;
+ shader.key.opt.ngg_culling =
+ (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) |
+ (key->vs_prolog.gs_fast_launch_tri_strip ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0);
shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs;
break;
case PIPE_SHADER_TESS_CTRL:
@@ -1990,6 +1993,7 @@
struct si_shader_context ctx;
si_llvm_context_init(&ctx, sscreen, compiler,
si_get_wave_size(sscreen, type, shader.key.as_ngg, shader.key.as_es,
+ shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL,
shader.key.opt.vs_as_prim_discard_cs));
ctx.shader = &shader;
ctx.type = type;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 2a60957..fc14b64 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -474,7 +474,8 @@
shader->is_gs_copy_shader = true;
si_llvm_context_init(&ctx, sscreen, compiler,
- si_get_wave_size(sscreen, PIPE_SHADER_VERTEX, false, false, false));
+ si_get_wave_size(sscreen, PIPE_SHADER_VERTEX,
+ false, false, false, false));
ctx.shader = shader;
ctx.type = PIPE_SHADER_VERTEX;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5945a47..b4e95b7 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -69,7 +69,7 @@
shader_variant_flags |= 1 << 0;
if (sel->nir)
shader_variant_flags |= 1 << 1;
- if (si_get_wave_size(sel->screen, sel->type, ngg, es, false) == 32)
+ if (si_get_wave_size(sel->screen, sel->type, ngg, es, false, false) == 32)
shader_variant_flags |= 1 << 2;
if (sel->type == PIPE_SHADER_FRAGMENT && sel->info.uses_derivatives && sel->info.uses_kill &&
sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL))
@@ -1120,11 +1120,13 @@
else
gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+ unsigned wave_size = si_get_shader_wave_size(shader);
+
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
si_pm4_set_reg(
pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
- S_00B228_VGPRS((shader->config.num_vgprs - 1) / (sscreen->ge_wave_size == 32 ? 8 : 4)) |
+ S_00B228_VGPRS((shader->config.num_vgprs - 1) / (wave_size == 32 ? 8 : 4)) |
S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_DX10_CLAMP(1) |
S_00B228_MEM_ORDERED(1) | S_00B228_WGP_MODE(1) |
S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt));
@@ -3692,7 +3694,9 @@
if (screen->info.chip_class >= GFX9)
stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
- if (screen->info.chip_class >= GFX10 && screen->ge_wave_size == 32) {
+ if (screen->info.chip_class >= GFX10 &&
+ /* GS fast launch hangs with Wave64, so always use Wave32. */
+ (screen->ge_wave_size == 32 || (key.u.ngg && key.u.ngg_gs_fast_launch))) {
stages |= S_028B54_HS_W32_EN(1) |
S_028B54_GS_W32_EN(key.u.ngg) | /* legacy GS only supports Wave64 */
S_028B54_VS_W32_EN(1);