radeonsi: add a thorough clear/copy_buffer benchmark
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 45bc93e..da55c81 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -119,6 +119,126 @@
 	return ureg_create_shader_and_destroy(ureg, &sctx->b);
 }
 
+/* Create a compute shader implementing clear_buffer or copy_buffer. */
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+				   unsigned num_dwords_per_thread,
+				   bool dst_stream_cache_policy, bool is_copy)
+{
+	assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+	unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+	if (dst_stream_cache_policy)
+		store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+	/* Don't cache loads, because there is no reuse. */
+	unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+	unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
+	unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+
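+	/* Each memory op accesses at most a vec4 (4 dwords). Since the dword
+	 * count is a power of two, this is either a single op of 1-4 dwords
+	 * or a series of vec4 ops.
+	 */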
+	for (unsigned i = 0; i < num_mem_ops; i++) {
+		if (i*4 < num_dwords_per_thread)
+			inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4);
+	}
+
+	struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
+	if (!ureg)
+		return NULL;
+
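+	/* Use a fixed block size of 64 threads (one GCN wavefront). */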
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1);
+	ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);
+
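+	/* For clears, the clear value is passed in via user data SGPRs. */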
+	struct ureg_src value;
+	if (!is_copy) {
+		ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_DWORDS, inst_dwords[0]);
+		value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA, 0);
+	}
+
+	struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
+	struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
+	struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+	struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X);
+	struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false));
+	struct ureg_src srcbuf;
+	struct ureg_src *values = NULL;
+
+	if (is_copy) {
+		srcbuf = ureg_DECL_buffer(ureg, 1, false);
+		values = malloc(num_mem_ops * sizeof(struct ureg_src));
+	}
+
+	/* If there are multiple stores, the first store writes into 0+tid,
+	 * the 2nd store writes into 64+tid, the 3rd store writes into 128+tid, etc.
+	 */
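+	/* E.g. with 2 vec4 stores per thread, thread T of block B starts at
+	 * byte (B*128 + T)*16, and the loop below adds 1024 bytes for the
+	 * 2nd store.
+	 */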
+	ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, 64 * num_mem_ops), tid);
+	/* Convert from a "store size unit" into bytes. */
+	ureg_UMUL(ureg, store_addr, ureg_src(store_addr),
+		  ureg_imm1u(ureg, 4 * inst_dwords[0]));
+	ureg_MOV(ureg, load_addr, ureg_src(store_addr));
+
+	/* Distance between a load and a store for latency hiding. */
+	unsigned load_store_distance = is_copy ? 8 : 0;
+
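+	/* The loop below is software-pipelined: iteration i issues load i
+	 * and store (i - load_store_distance), so up to 8 loads are in
+	 * flight before their results are consumed by stores.
+	 */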
+	for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
+		int d = i - load_store_distance;
+
+		if (is_copy && i < num_mem_ops) {
+			if (i) {
+				ureg_UADD(ureg, load_addr, ureg_src(load_addr),
+					  ureg_imm1u(ureg, 4 * inst_dwords[i] * 64));
+			}
+
+			values[i] = ureg_src(ureg_DECL_temporary(ureg));
+			struct ureg_dst dst =
+				ureg_writemask(ureg_dst(values[i]),
+					       u_bit_consecutive(0, inst_dwords[i]));
+			struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)};
+			ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2,
+					 load_qualifier, TGSI_TEXTURE_BUFFER, 0);
+		}
+
+		if (d >= 0) {
+			if (d) {
+				ureg_UADD(ureg, store_addr, ureg_src(store_addr),
+					  ureg_imm1u(ureg, 4 * inst_dwords[d] * 64));
+			}
+
+			struct ureg_dst dst =
+				ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d]));
+			struct ureg_src srcs[] =
+				{ureg_src(store_addr), is_copy ? values[d] : value};
+			ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2,
+					 store_qualifier, TGSI_TEXTURE_BUFFER, 0);
+		}
+	}
+	ureg_END(ureg);
+
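+	/* Turn the ureg program into TGSI tokens and create the compute state. */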
+	struct pipe_compute_state state = {};
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = ureg_get_tokens(ureg, NULL);
+
+	void *cs = ctx->create_compute_state(ctx, &state);
+	ureg_destroy(ureg);
+	free(values);
+	return cs;
+}
+
 /* Create the compute shader that is used to collect the results.
  *
  * One compute grid with a single thread is launched for every query result