radv: force persample shading when required.

We need to force persample shading when
a) shader uses sample_id
b) shader uses sample_position
c) shader uses sample qualifier.

Also since ps_iter_samples can now change independently of the
rasterizer samples we need to move setting the regs more often.

This fixes:
dEQP-VK.pipeline.multisample_interpolation.centroid_interpolate_at_consistency.*
dEQP-VK.pipeline.multisample_interpolation.centroid_qualifier_inside_primitive.137_191_1.*
dEQP-VK.pipeline.multisample_interpolation.sample_interpolate_at_distinct_values.*
dEQP-VK.pipeline.multisample_interpolation.sample_qualifier_distinct_values.128_128_1.*

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index d9eef57..ee98f5f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -2940,9 +2940,11 @@
 		result = ctx->start_instance;
 		break;
 	case nir_intrinsic_load_sample_id:
+		ctx->shader_info->fs.force_persample = true;
 		result = unpack_param(ctx, ctx->ancillary, 8, 4);
 		break;
 	case nir_intrinsic_load_sample_pos:
+		ctx->shader_info->fs.force_persample = true;
 		result = load_sample_pos(ctx);
 		break;
 	case nir_intrinsic_load_front_face:
@@ -3959,9 +3961,18 @@
 	variable->data.driver_location = idx * 4;
 	ctx->input_mask |= ((1ull << attrib_count) - 1) << variable->data.location;
 
-	if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT)
-		interp = lookup_interp_param(ctx, variable->data.interpolation, INTERP_CENTER);
-	else
+	if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) {
+		unsigned interp_type;
+		if (variable->data.sample) {
+			interp_type = INTERP_SAMPLE;
+			ctx->shader_info->fs.force_persample = true;
+		} else if (variable->data.centroid)
+			interp_type = INTERP_CENTROID;
+		else
+			interp_type = INTERP_CENTER;
+
+		interp = lookup_interp_param(ctx, variable->data.interpolation, interp_type);
+	} else
 		interp = NULL;
 
 	for (unsigned i = 0; i < attrib_count; ++i)
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index ca06d05..f33519c 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -81,6 +81,7 @@
 			bool writes_stencil;
 			bool early_fragment_test;
 			bool writes_memory;
+			bool force_persample;
 		} fs;
 		struct {
 			unsigned block_size[3];
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index d1e4deb..0eda0bc 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -288,6 +288,9 @@
 	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[0]);
 	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_mask[1]);
 
+	radeon_set_context_reg(cmd_buffer->cs, CM_R_028804_DB_EQAA, ms->db_eqaa);
+	radeon_set_context_reg(cmd_buffer->cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
+
 	if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
 		return;
 
@@ -295,9 +298,6 @@
 	radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
 	radeon_emit(cmd_buffer->cs, ms->pa_sc_aa_config);
 
-	radeon_set_context_reg(cmd_buffer->cs, CM_R_028804_DB_EQAA, ms->db_eqaa);
-	radeon_set_context_reg(cmd_buffer->cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
-
 	radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);
 
 	uint32_t samples_offset;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 7d7d0c6..23ed2d2 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1022,6 +1022,11 @@
 	uint32_t mask = 0xffff;
 
 	ms->num_samples = vkms->rasterizationSamples;
+
+	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.force_persample) {
+		ps_iter_samples = vkms->rasterizationSamples;
+	}
+
 	ms->pa_sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
 	ms->pa_sc_aa_config = 0;
 	ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) |