radv: use nir_opt_find_array_copies()

Totals from affected shaders:
SGPRS: 1112 -> 1112 (0.00 %)
VGPRS: 1492 -> 1196 (-19.84 %)
Spilled SGPRs: 0 -> 0 (0.00 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 112172 -> 101316 (-9.68 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 93 -> 98 (5.38 %)
Wait states: 0 -> 0 (0.00 %)

All affected shaders are from "Batman: Arkham City" over DXVK.

The pass detects that the temporary array created by DXVK for
storing TCS inputs is a copy of the input arrays and allows
us to avoid copying all of the input data and then indirecting
on it with if-ladders, instead we just do indirect indexing.

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 3b3422c..52aa83d 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -118,7 +118,8 @@
 }
 
 void
-radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively)
+radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively,
+                  bool allow_copies)
 {
         bool progress;
 
@@ -128,6 +129,15 @@
                 NIR_PASS_V(shader, nir_lower_vars_to_ssa);
 		NIR_PASS_V(shader, nir_lower_pack);
 
+		if (allow_copies) {
+			/* Only run this pass in the first call to
+			 * radv_optimize_nir.  Later calls assume that we've
+			 * lowered away any copy_deref instructions and we
+			 *  don't want to introduce any more.
+			*/
+			NIR_PASS(progress, shader, nir_opt_find_array_copies);
+		}
+
 		NIR_PASS(progress, shader, nir_opt_copy_prop_vars);
 		NIR_PASS(progress, shader, nir_opt_dead_write_vars);
 
@@ -306,7 +316,6 @@
 	}
 
 	nir_split_var_copies(nir);
-	nir_lower_var_copies(nir);
 
 	nir_lower_global_vars_to_local(nir);
 	nir_remove_dead_variables(nir, nir_var_local);
@@ -323,7 +332,12 @@
 	nir_lower_load_const_to_scalar(nir);
 
 	if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
-		radv_optimize_nir(nir, false);
+		radv_optimize_nir(nir, false, true);
+
+	/* We call nir_lower_var_copies() after the first radv_optimize_nir()
+	 * to remove any copies introduced by nir_opt_find_array_copies().
+	 */
+	nir_lower_var_copies(nir);
 
 	/* Indirect lowering must be called after the radv_optimize_nir() loop
 	 * has been called at least once. Otherwise indirect lowering can
@@ -331,7 +345,7 @@
 	 * considered too large for unrolling.
 	 */
 	ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class);
-	radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT);
+	radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT, false);
 
 	return nir;
 }