radeonsi: expand FMASK before MSAA image stores are used

Image stores don't use FMASK, so we have to turn it into identity.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index fbf83a0..0cf0cd9 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -665,6 +665,75 @@
 	return ctx->create_compute_state(ctx, &state);
 }
 
+/* Load samples from the image, and copy them to the same image. This looks like
+ * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
+ * reordered to match expanded FMASK.
+ *
+ * After the shader finishes, FMASK should be cleared to identity.
+ */
+void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples,
+				bool is_array)
+{
+	enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA :
+						   TGSI_TEXTURE_2D_MSAA;
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+	if (!ureg)
+		return NULL;
+
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
+	/* Compute the image coordinates. */
+	struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
+	struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+	struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+	struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg),
+					       TGSI_WRITEMASK_XYZ);
+	ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY),
+		  ureg_swizzle(blk, 0, 1, 1, 1), ureg_imm2u(ureg, 8, 8),
+		  ureg_swizzle(tid, 0, 1, 1, 1));
+	if (is_array) {
+		ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z),
+			 ureg_scalar(blk, TGSI_SWIZZLE_Z));
+	}
+
+	/* Load samples, resolving FMASK. */
+	struct ureg_dst sample[8];
+	assert(num_samples <= ARRAY_SIZE(sample));
+
+	for (unsigned i = 0; i < num_samples; i++) {
+		sample[i] = ureg_DECL_temporary(ureg);
+
+		ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W),
+			 ureg_imm1u(ureg, i));
+
+		struct ureg_src srcs[] = {image, ureg_src(coord)};
+		ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2,
+				 TGSI_MEMORY_RESTRICT, target, 0);
+	}
+
+	/* Store samples, ignoring FMASK. */
+	for (unsigned i = 0; i < num_samples; i++) {
+		ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W),
+			 ureg_imm1u(ureg, i));
+
+		struct ureg_dst dst_image = ureg_dst(image);
+		struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
+		ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2,
+				 TGSI_MEMORY_RESTRICT, target, 0);
+	}
+	ureg_END(ureg);
+
+	struct pipe_compute_state state = {};
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = ureg_get_tokens(ureg, NULL);
+
+	void *cs = ctx->create_compute_state(ctx, &state);
+	ureg_destroy(ureg);
+	return cs;
+}
+
 /* Create the compute shader that is used to collect the results of gfx10+
  * shader queries.
  *