freedreno/a6xx: Implement user clip/cull distances
Also, plumb has_clip_cull through ir3 so that clip planes are no longer
lowered to discard in the fragment shader on hardware that supports them.
This seems to fix some artifacts in the neverball trace.
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6959>
diff --git a/.gitlab-ci/deqp-freedreno-a630-fails.txt b/.gitlab-ci/deqp-freedreno-a630-fails.txt
index f4ad6b7..19519c5 100644
--- a/.gitlab-ci/deqp-freedreno-a630-fails.txt
+++ b/.gitlab-ci/deqp-freedreno-a630-fails.txt
@@ -13,7 +13,6 @@
dEQP-VK.tessellation.invariance.outer_edge_index_independence.quads_fractional_even_spacing_ccw_point_mode
dEQP-VK.tessellation.invariance.outer_edge_symmetry.triangles_fractional_odd_spacing_cw_point_mode
-KHR-GL30.clip_distance.functional
KHR-GL30.transform_feedback.api_errors_test
KHR-GL30.transform_feedback.capture_vertex_interleaved_test
KHR-GL30.transform_feedback.capture_vertex_separate_test
diff --git a/.gitlab-ci/traces-freedreno.yml b/.gitlab-ci/traces-freedreno.yml
index 2bc3281..8d47fcd 100644
--- a/.gitlab-ci/traces-freedreno.yml
+++ b/.gitlab-ci/traces-freedreno.yml
@@ -236,7 +236,7 @@
- path: neverball/neverball.trace
expectations:
- device: freedreno-a630
- checksum: e67cdf15590f1729201eb82393f5513e
+ checksum: 3e0a972c2a2180b349cb1c529d3ceca5
- path: pathfinder/canvas_moire.trace
expectations:
- device: freedreno-a630
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 4bc246b..9080ed7 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -94,6 +94,9 @@
*/
compiler->max_const_compute = 256;
+ /* TODO: implement clip+cull distances on earlier generations */
+ compiler->has_clip_cull = true;
+
if (compiler->gpu_id == 650)
compiler->tess_use_shared = true;
} else {
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 5d7d140..0c9a2a4 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -105,6 +105,9 @@
* vec4 units):
*/
uint32_t const_upload_unit;
+
+ /* Whether clip+cull distances are supported */
+ bool has_clip_cull;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 1ee2956..dfb5f29 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -460,7 +460,7 @@
bool layer_zero = so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER);
bool view_zero = so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT);
- if (so->key.ucp_enables)
+ if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull)
progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false);
if (so->key.fclamp_color)
progress |= OPT(s, nir_lower_clamp_color_outputs);
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index b29a66a..bc5cb51 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -338,7 +338,12 @@
key->safe_constlen = true;
- key->ucp_enables = 0xff;
+ /* When clip/cull distances are natively supported, we only use
+ * ucp_enables to determine whether to lower legacy clip planes to
+ * gl_ClipDistance.
+ */
+ if (info->stage != MESA_SHADER_FRAGMENT || !shader->compiler->has_clip_cull)
+ key->ucp_enables = 0xff;
if (info->stage == MESA_SHADER_FRAGMENT) {
key->fsaturate_s = ~0;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
index 5b6c2ca..406ad0b 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c
@@ -307,6 +307,7 @@
bool binning_pass)
{
uint32_t pos_regid, psize_regid, color_regid[8], posz_regid;
+ uint32_t clip0_regid, clip1_regid;
uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
uint32_t smask_in_regid, smask_regid;
uint32_t vertex_regid, instance_regid, layer_regid, primitive_regid;
@@ -316,6 +317,7 @@
uint32_t gs_header_regid;
enum a3xx_threadsize fssz;
uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0;
+ uint8_t clip0_loc, clip1_loc;
int i, j;
static const struct ir3_shader_variant dummy_fs = {0};
@@ -337,6 +339,8 @@
pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS);
psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ);
+ clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0);
+ clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1);
vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
@@ -349,6 +353,8 @@
pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS);
psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ);
+ clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0);
+ clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1);
} else {
tess_coord_x_regid = regid(63, 0);
tess_coord_y_regid = regid(63, 0);
@@ -362,6 +368,8 @@
primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID);
pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS);
psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ);
+ clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0);
+ clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1);
layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER);
} else {
gs_header_regid = regid(63, 0);
@@ -464,6 +472,8 @@
const struct ir3_shader_variant *last_shader = fd6_last_shader(state);
bool do_streamout = (last_shader->shader->stream_output.num_outputs > 0);
+ uint8_t clip_mask = last_shader->clip_mask, cull_mask = last_shader->cull_mask;
+ uint8_t clip_cull_mask = clip_mask | cull_mask;
/* If we have streamout, link against the real FS, rather than the
* dummy FS used for binning pass state, to ensure the OUTLOC's
@@ -475,6 +485,8 @@
ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true);
bool primid_passthru = l.primid_loc != 0xff;
+ clip0_loc = l.clip0_loc;
+ clip1_loc = l.clip1_loc;
OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */
@@ -500,6 +512,20 @@
ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
}
+ /* Handle the case where clip/cull distances aren't read by the FS. Make
+ * sure to avoid adding an output with an empty writemask if the user
+ * disables all the clip distances in the API so that the slot is unused.
+ */
+ if (clip0_loc == 0xff && VALIDREG(clip0_regid) && (clip_cull_mask & 0xf) != 0) {
+ clip0_loc = l.max_loc;
+ ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc);
+ }
+
+ if (clip1_loc == 0xff && VALIDREG(clip1_regid) && (clip_cull_mask >> 4) != 0) {
+ clip1_loc = l.max_loc;
+ ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc);
+ }
+
/* If we have stream-out, we use the full shader for binning
* pass, rather than the optimized binning pass one, so that we
* have all the varying outputs available for xfb. So streamout
@@ -602,7 +628,9 @@
A6XX_PC_TESS_CNTL_OUTPUT(output));
OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1);
- OUT_RING(ring, 0x00ffff00);
+ OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+ A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+ A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1);
OUT_RING(ring, 0x0000ffff);
@@ -611,7 +639,8 @@
OUT_RING(ring, 0x0);
OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1);
- OUT_RING(ring, 0x0);
+ OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) |
+ A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask));
OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1);
OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) |
@@ -628,7 +657,8 @@
OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1);
OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
- CONDREG(psize_regid, 0x100));
+ CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) |
+ A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
} else {
OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
@@ -648,7 +678,8 @@
OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1);
OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
- CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE));
+ CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
+ A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1);
OUT_RING(ring, 0);
@@ -785,7 +816,8 @@
OUT_RING(ring, A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) |
CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) |
CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) |
- CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID));
+ CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) |
+ A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
uint32_t output;
switch (gs->shader->nir->info.gs.output_primitive) {
@@ -808,13 +840,16 @@
A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(gs->shader->nir->info.gs.invocations - 1));
OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1);
- OUT_RING(ring, 0);
+ OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) |
+ A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask));
OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9100, 1);
OUT_RING(ring, 0xff);
OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1);
- OUT_RING(ring, 0xffff00);
+ OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+ A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+ A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs;
@@ -838,7 +873,13 @@
}
OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1);
- OUT_RING(ring, 0xffff00);
+ OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
+ A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
+ A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
+
+ OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1);
+ OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) |
+ A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask));
OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1);
OUT_RING(ring, 0);
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c
index 1f6bf70..3532477 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c
@@ -58,8 +58,7 @@
.unk5 = !cso->depth_clip_near || !cso->depth_clip_far,
.vp_clip_code_ignore = 1,
.zero_gb_scale_z = cso->clip_halfz
- ),
- A6XX_GRAS_VS_CL_CNTL());
+ ));
OUT_REG(ring,
A6XX_GRAS_SU_CNTL(
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index bea2399..08dbcf5 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -461,6 +461,8 @@
return fd_device_version(screen->dev) >= FD_VERSION_FENCE_FD;
case PIPE_CAP_FENCE_SIGNAL:
return screen->has_syncobj;
+ case PIPE_CAP_CULL_DISTANCE:
+ return is_a6xx(screen);
default:
return u_pipe_screen_get_param_defaults(pscreen, param);
}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 810c3b2..cb28ed5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -188,6 +188,7 @@
*/
struct ir3_shader_key key = {
.tessellation = IR3_TESS_NONE,
+ .ucp_enables = MASK(nir->info.clip_distance_array_size),
.msaa = true,
};