radeonsi: implement and use compute-based DCC decompression on gfx9-10

DCC_DECOMPRESS doesn't work. Instead of trying to figure out why,
use a compute blit where the load is compressed and the store is
uncompressed.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4761>
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index e5fd089..d1a97c2 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -573,6 +573,45 @@
    return ctx->create_compute_state(ctx, &state);
 }
 
+/* Create a compute shader implementing DCC decompression via a blit.
+ * This is a trivial copy_image shader except that it has a variable block
+ * size and a barrier.
+ */
+void *si_create_dcc_decompress_cs(struct pipe_context *ctx)
+{
+   static const char text[] =
+      "COMP\n"
+      "DCL SV[0], THREAD_ID\n"
+      "DCL SV[1], BLOCK_ID\n"
+      "DCL SV[2], BLOCK_SIZE\n"
+      "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+      "DCL TEMP[0..1]\n"
+
+      "UMAD TEMP[0].xyz, SV[1].xyzz, SV[2].xyzz, SV[0].xyzz\n"
+      "LOAD TEMP[1], IMAGE[0], TEMP[0].xyzz, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      /* Wait for the whole threadgroup (= DCC block) to load texels before
+       * overwriting them, because overwriting any pixel within a DCC block
+       * can break compression for the whole block.
+       */
+      "BARRIER\n"
+      "STORE IMAGE[1], TEMP[0].xyzz, TEMP[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+      "END\n";
+
+   struct tgsi_token tokens[1024];
+   struct pipe_compute_state state = {0};
+
+   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+      assert(false);
+      return NULL;
+   }
+
+   state.ir_type = PIPE_SHADER_IR_TGSI;
+   state.prog = tokens;
+
+   return ctx->create_compute_state(ctx, &state);
+}
+
 void *si_clear_render_target_shader(struct pipe_context *ctx)
 {
    static const char text[] =