freedreno/a6xx: Implement user clip/cull distances

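Also, plumb a has_clip_cull flag through ir3 so that user clip planes
are no longer lowered to discard in the fragment shader when the
compiler reports native clip/cull distance support.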

This seems to fix some artifacts in the neverball trace.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6959>
diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt
index f4ad6b7..19519c5 100644
--- a/.gitlab-ci/deqp-freedreno-a630-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt
@@ -13,7 +13,6 @@
 dEQP-VK.tessellation.invariance.outer_edge_index_independence.quads_fractional_even_spacing_ccw_point_mode
 dEQP-VK.tessellation.invariance.outer_edge_symmetry.triangles_fractional_odd_spacing_cw_point_mode
 
-KHR-GL30.clip_distance.functional
 KHR-GL30.transform_feedback.api_errors_test
 KHR-GL30.transform_feedback.capture_vertex_interleaved_test
 KHR-GL30.transform_feedback.capture_vertex_separate_test
diff --git a/.gitlab-ci/traces-freedreno.yml b/.gitlab-ci/traces-freedreno.yml
index 2bc3281..8d47fcd 100644
--- a/.gitlab-ci/traces-freedreno.yml
+++ b/.gitlab-ci/traces-freedreno.yml
@@ -236,7 +236,7 @@
   - path: neverball/neverball.trace
     expectations:
       - device: freedreno-a630
-        checksum: e67cdf15590f1729201eb82393f5513e
+        checksum: 3e0a972c2a2180b349cb1c529d3ceca5
   - path: pathfinder/canvas_moire.trace
     expectations:
       - device: freedreno-a630
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 4bc246b..9080ed7 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -94,6 +94,9 @@
 		 */
 		compiler->max_const_compute = 256;
 
+		/* TODO: implement clip+cull distances on earlier gens */
+		compiler->has_clip_cull = true;
+
 		if (compiler->gpu_id == 650)
 			compiler->tess_use_shared = true;
 	} else {
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 5d7d140..0c9a2a4 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -105,6 +105,9 @@
 	 * vec4 units):
 	 */
 	uint32_t const_upload_unit;
+
+	/* Whether clip+cull distances are supported */
+	bool has_clip_cull;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 1ee2956..dfb5f29 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -460,7 +460,7 @@
 		bool layer_zero = so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER);
 		bool view_zero = so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT);
 
-		if (so->key.ucp_enables)
+		if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull)
 			progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false);
 		if (so->key.fclamp_color)
 			progress |= OPT(s, nir_lower_clamp_color_outputs);
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index b29a66a..bc5cb51 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -338,7 +338,12 @@
 
 	key->safe_constlen = true;
 
-	key->ucp_enables = 0xff;
+	/* When clip/cull distances are natively supported, we only use
+	 * ucp_enables to determine whether to lower legacy clip planes to
+	 * gl_ClipDistance.
+	 */
+	if (info->stage != MESA_SHADER_FRAGMENT || !shader->compiler->has_clip_cull)
+		key->ucp_enables = 0xff;
 
 	if (info->stage == MESA_SHADER_FRAGMENT) {
 		key->fsaturate_s = ~0;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
index 5b6c2ca..406ad0b 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
@@ -307,6 +307,7 @@
 		bool binning_pass)
 {
 	uint32_t pos_regid, psize_regid, color_regid[8], posz_regid;
+	uint32_t clip0_regid, clip1_regid;
 	uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
 	uint32_t smask_in_regid, smask_regid;
 	uint32_t vertex_regid, instance_regid, layer_regid, primitive_regid;
@@ -316,6 +317,7 @@
 	uint32_t gs_header_regid;
 	enum a3xx_threadsize fssz;
 	uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0;
+	uint8_t clip0_loc, clip1_loc;
 	int i, j;
 
 	static const struct ir3_shader_variant dummy_fs = {0};
@@ -337,6 +339,8 @@
 
 	pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS);
 	psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ);
+	clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0);
+	clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1);
 	vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
 	instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
 
@@ -349,6 +353,8 @@
 
 		pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS);
 		psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ);
+		clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0);
+		clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1);
 	} else {
 		tess_coord_x_regid = regid(63, 0);
 		tess_coord_y_regid = regid(63, 0);
@@ -362,6 +368,8 @@
 		primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID);
 		pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS);
 		psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ);
+		clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0);
+		clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1);
 		layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER);
 	} else {
 		gs_header_regid = regid(63, 0);
@@ -464,6 +472,8 @@
 	const struct ir3_shader_variant *last_shader = fd6_last_shader(state);
 
 	bool do_streamout = (last_shader->shader->stream_output.num_outputs > 0);
+	uint8_t clip_mask = last_shader->clip_mask, cull_mask = last_shader->cull_mask;
+	uint8_t clip_cull_mask = clip_mask | cull_mask;
 
 	/* If we have streamout, link against the real FS, rather than the
 	 * dummy FS used for binning pass state, to ensure the OUTLOC's
@@ -475,6 +485,8 @@
 	ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true);
 
 	bool primid_passthru = l.primid_loc != 0xff;
+	clip0_loc = l.clip0_loc;
+	clip1_loc = l.clip1_loc;
 
 	OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
 	OUT_RING(ring, ~l.varmask[0]);  /* VPC_VAR[0].DISABLE */
@@ -500,6 +512,20 @@
 		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
 	}
 
+	/* Handle the case where clip/cull distances aren't read by the FS. Make
+	 * sure to avoid adding an output with an empty writemask if the user
+	 * disables all the clip distances in the API so that the slot is unused.
+	 */
+	if (clip0_loc == 0xff && VALIDREG(clip0_regid) && (clip_cull_mask & 0xf) != 0) {
+		clip0_loc = l.max_loc;
+		ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc);
+	}
+
+	if (clip1_loc == 0xff && VALIDREG(clip1_regid) && (clip_cull_mask >> 4) != 0) {
+		clip1_loc = l.max_loc;
+		ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc);
+	}
+
 	/* If we have stream-out, we use the full shader for binning
 	 * pass, rather than the optimized binning pass one, so that we
 	 * have all the varying outputs available for xfb.  So streamout
@@ -602,7 +628,9 @@
 				A6XX_PC_TESS_CNTL_OUTPUT(output));
 
 		OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1);
-		OUT_RING(ring, 0x00ffff00);
+		OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+				       A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+					   A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
 
 		OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1);
 		OUT_RING(ring, 0x0000ffff);
@@ -611,7 +639,8 @@
 		OUT_RING(ring, 0x0);
 
 		OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1);
-		OUT_RING(ring, 0x0);
+		OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) |
+				       A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask));
 
 		OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1);
 		OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) |
@@ -628,7 +657,8 @@
 
 		OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1);
 		OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
-				CONDREG(psize_regid, 0x100));
+				CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) |
+				A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
 
 	} else {
 		OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
@@ -648,7 +678,8 @@
 
 	OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1);
 	OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
-			CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE));
+			CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
+			A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
 
 	OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1);
 	OUT_RING(ring, 0);
@@ -785,7 +816,8 @@
 		OUT_RING(ring, A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
 				CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) |
 				CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) |
-				CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID));
+				CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) |
+				A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
 
 		uint32_t output;
 		switch (gs->shader->nir->info.gs.output_primitive) {
@@ -808,13 +840,16 @@
 				A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(gs->shader->nir->info.gs.invocations - 1));
 
 		OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1);
-		OUT_RING(ring, 0);
+		OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) |
+				       A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask));
 
 		OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9100, 1);
 		OUT_RING(ring, 0xff);
 
 		OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1);
-		OUT_RING(ring, 0xffff00);
+		OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+				       A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+					   A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
 
 		const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs;
 
@@ -838,7 +873,13 @@
 	}
 
 	OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1);
-	OUT_RING(ring, 0xffff00);
+	OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+				   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+				   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
+
+	OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1);
+	OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) |
+				   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask));
 
 	OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1);
 	OUT_RING(ring, 0);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c
index 1f6bf70..3532477 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c
@@ -58,8 +58,7 @@
 			.unk5 = !cso->depth_clip_near || !cso->depth_clip_far,
 			.vp_clip_code_ignore = 1,
 			.zero_gb_scale_z = cso->clip_halfz
-			),
-		A6XX_GRAS_VS_CL_CNTL());
+			));
 
 	OUT_REG(ring,
 		A6XX_GRAS_SU_CNTL(
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index bea2399..08dbcf5 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -461,6 +461,8 @@
 		return fd_device_version(screen->dev) >= FD_VERSION_FENCE_FD;
 	case PIPE_CAP_FENCE_SIGNAL:
 		return screen->has_syncobj;
+	case PIPE_CAP_CULL_DISTANCE:
+		return is_a6xx(screen);
 	default:
 		return u_pipe_screen_get_param_defaults(pscreen, param);
 	}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 810c3b2..cb28ed5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -188,6 +188,7 @@
 	 */
 	struct ir3_shader_key key = {
 		.tessellation = IR3_TESS_NONE,
+		.ucp_enables = MASK(nir->info.clip_distance_array_size),
 		.msaa = true,
 	};