drm/radeon: fixup tiling group size and backendmap on r6xx-r9xx (v4)

Tiling group size is always 256bits on r6xx/r7xx/r8xx/9xx. Also fix and
simplify render backend map. This now properly sets up the backend map
on r6xx-9xx which should improve 3D performance.

Vadim benchmarked also:
Some benchmarks on juniper (5750), fullscreen 1920x1080,
first result - kernel 3.4.0+ (fb21affa), second - with these patches:

Lightsmark:   91 fps => 123 fps    +35%
Doom3:        74 fps => 101 fps    +36%

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index f388a1d..45cfcea 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -1376,113 +1376,51 @@
 	return r600_gpu_soft_reset(rdev);
 }
 
-static u32 r600_get_tile_pipe_to_backend_map(u32 num_tile_pipes,
-					     u32 num_backends,
-					     u32 backend_disable_mask)
+u32 r6xx_remap_render_backend(struct radeon_device *rdev,
+			      u32 tiling_pipe_num,
+			      u32 max_rb_num,
+			      u32 total_max_rb_num,
+			      u32 disabled_rb_mask)
 {
-	u32 backend_map = 0;
-	u32 enabled_backends_mask;
-	u32 enabled_backends_count;
-	u32 cur_pipe;
-	u32 swizzle_pipe[R6XX_MAX_PIPES];
-	u32 cur_backend;
-	u32 i;
+	u32 rendering_pipe_num, rb_num_width, req_rb_num;
+	u32 pipe_rb_ratio, pipe_rb_remain;
+	u32 data = 0, mask = 1 << (max_rb_num - 1);
+	unsigned i, j;
 
-	if (num_tile_pipes > R6XX_MAX_PIPES)
-		num_tile_pipes = R6XX_MAX_PIPES;
-	if (num_tile_pipes < 1)
-		num_tile_pipes = 1;
-	if (num_backends > R6XX_MAX_BACKENDS)
-		num_backends = R6XX_MAX_BACKENDS;
-	if (num_backends < 1)
-		num_backends = 1;
+	/* mask out the RBs that don't exist on that asic */
+	disabled_rb_mask |= (0xff << max_rb_num) & 0xff;
 
-	enabled_backends_mask = 0;
-	enabled_backends_count = 0;
-	for (i = 0; i < R6XX_MAX_BACKENDS; ++i) {
-		if (((backend_disable_mask >> i) & 1) == 0) {
-			enabled_backends_mask |= (1 << i);
-			++enabled_backends_count;
+	rendering_pipe_num = 1 << tiling_pipe_num;
+	req_rb_num = total_max_rb_num - r600_count_pipe_bits(disabled_rb_mask);
+	BUG_ON(rendering_pipe_num < req_rb_num);
+
+	pipe_rb_ratio = rendering_pipe_num / req_rb_num;
+	pipe_rb_remain = rendering_pipe_num - pipe_rb_ratio * req_rb_num;
+
+	if (rdev->family <= CHIP_RV740) {
+		/* r6xx/r7xx */
+		rb_num_width = 2;
+	} else {
+		/* eg+ */
+		rb_num_width = 4;
+	}
+
+	for (i = 0; i < max_rb_num; i++) {
+		if (!(mask & disabled_rb_mask)) {
+			for (j = 0; j < pipe_rb_ratio; j++) {
+				data <<= rb_num_width;
+				data |= max_rb_num - i - 1;
+			}
+			if (pipe_rb_remain) {
+				data <<= rb_num_width;
+				data |= max_rb_num - i - 1;
+				pipe_rb_remain--;
+			}
 		}
-		if (enabled_backends_count == num_backends)
-			break;
+		mask >>= 1;
 	}
 
-	if (enabled_backends_count == 0) {
-		enabled_backends_mask = 1;
-		enabled_backends_count = 1;
-	}
-
-	if (enabled_backends_count != num_backends)
-		num_backends = enabled_backends_count;
-
-	memset((uint8_t *)&swizzle_pipe[0], 0, sizeof(u32) * R6XX_MAX_PIPES);
-	switch (num_tile_pipes) {
-	case 1:
-		swizzle_pipe[0] = 0;
-		break;
-	case 2:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 1;
-		break;
-	case 3:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 1;
-		swizzle_pipe[2] = 2;
-		break;
-	case 4:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 1;
-		swizzle_pipe[2] = 2;
-		swizzle_pipe[3] = 3;
-		break;
-	case 5:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 1;
-		swizzle_pipe[2] = 2;
-		swizzle_pipe[3] = 3;
-		swizzle_pipe[4] = 4;
-		break;
-	case 6:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 2;
-		swizzle_pipe[2] = 4;
-		swizzle_pipe[3] = 5;
-		swizzle_pipe[4] = 1;
-		swizzle_pipe[5] = 3;
-		break;
-	case 7:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 2;
-		swizzle_pipe[2] = 4;
-		swizzle_pipe[3] = 6;
-		swizzle_pipe[4] = 1;
-		swizzle_pipe[5] = 3;
-		swizzle_pipe[6] = 5;
-		break;
-	case 8:
-		swizzle_pipe[0] = 0;
-		swizzle_pipe[1] = 2;
-		swizzle_pipe[2] = 4;
-		swizzle_pipe[3] = 6;
-		swizzle_pipe[4] = 1;
-		swizzle_pipe[5] = 3;
-		swizzle_pipe[6] = 5;
-		swizzle_pipe[7] = 7;
-		break;
-	}
-
-	cur_backend = 0;
-	for (cur_pipe = 0; cur_pipe < num_tile_pipes; ++cur_pipe) {
-		while (((1 << cur_backend) & enabled_backends_mask) == 0)
-			cur_backend = (cur_backend + 1) % R6XX_MAX_BACKENDS;
-
-		backend_map |= (u32)(((cur_backend & 3) << (swizzle_pipe[cur_pipe] * 2)));
-
-		cur_backend = (cur_backend + 1) % R6XX_MAX_BACKENDS;
-	}
-
-	return backend_map;
+	return data;
 }
 
 int r600_count_pipe_bits(uint32_t val)
@@ -1500,7 +1438,6 @@
 {
 	u32 tiling_config;
 	u32 ramcfg;
-	u32 backend_map;
 	u32 cc_rb_backend_disable;
 	u32 cc_gc_shader_pipe_config;
 	u32 tmp;
@@ -1511,8 +1448,9 @@
 	u32 sq_thread_resource_mgmt = 0;
 	u32 sq_stack_resource_mgmt_1 = 0;
 	u32 sq_stack_resource_mgmt_2 = 0;
+	u32 disabled_rb_mask;
 
-	/* FIXME: implement */
+	rdev->config.r600.tiling_group_size = 256;
 	switch (rdev->family) {
 	case CHIP_R600:
 		rdev->config.r600.max_pipes = 4;
@@ -1616,10 +1554,7 @@
 	rdev->config.r600.tiling_nbanks = 4 << ((ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT);
 	tiling_config |= BANK_TILING((ramcfg & NOOFBANK_MASK) >> NOOFBANK_SHIFT);
 	tiling_config |= GROUP_SIZE((ramcfg & BURSTLENGTH_MASK) >> BURSTLENGTH_SHIFT);
-	if ((ramcfg & BURSTLENGTH_MASK) >> BURSTLENGTH_SHIFT)
-		rdev->config.r600.tiling_group_size = 512;
-	else
-		rdev->config.r600.tiling_group_size = 256;
+
 	tmp = (ramcfg & NOOFROWS_MASK) >> NOOFROWS_SHIFT;
 	if (tmp > 3) {
 		tiling_config |= ROW_TILING(3);
@@ -1631,32 +1566,36 @@
 	tiling_config |= BANK_SWAPS(1);
 
 	cc_rb_backend_disable = RREG32(CC_RB_BACKEND_DISABLE) & 0x00ff0000;
-	cc_rb_backend_disable |=
-		BACKEND_DISABLE((R6XX_MAX_BACKENDS_MASK << rdev->config.r600.max_backends) & R6XX_MAX_BACKENDS_MASK);
+	tmp = R6XX_MAX_BACKENDS -
+		r600_count_pipe_bits((cc_rb_backend_disable >> 16) & R6XX_MAX_BACKENDS_MASK);
+	if (tmp < rdev->config.r600.max_backends) {
+		rdev->config.r600.max_backends = tmp;
+	}
 
-	cc_gc_shader_pipe_config = RREG32(CC_GC_SHADER_PIPE_CONFIG) & 0xffffff00;
-	cc_gc_shader_pipe_config |=
-		INACTIVE_QD_PIPES((R6XX_MAX_PIPES_MASK << rdev->config.r600.max_pipes) & R6XX_MAX_PIPES_MASK);
-	cc_gc_shader_pipe_config |=
-		INACTIVE_SIMDS((R6XX_MAX_SIMDS_MASK << rdev->config.r600.max_simds) & R6XX_MAX_SIMDS_MASK);
+	cc_gc_shader_pipe_config = RREG32(CC_GC_SHADER_PIPE_CONFIG) & 0x00ffff00;
+	tmp = R6XX_MAX_PIPES -
+		r600_count_pipe_bits((cc_gc_shader_pipe_config >> 8) & R6XX_MAX_PIPES_MASK);
+	if (tmp < rdev->config.r600.max_pipes) {
+		rdev->config.r600.max_pipes = tmp;
+	}
+	tmp = R6XX_MAX_SIMDS -
+		r600_count_pipe_bits((cc_gc_shader_pipe_config >> 16) & R6XX_MAX_SIMDS_MASK);
+	if (tmp < rdev->config.r600.max_simds) {
+		rdev->config.r600.max_simds = tmp;
+	}
 
-	backend_map = r600_get_tile_pipe_to_backend_map(rdev->config.r600.max_tile_pipes,
-							(R6XX_MAX_BACKENDS -
-							 r600_count_pipe_bits((cc_rb_backend_disable &
-									       R6XX_MAX_BACKENDS_MASK) >> 16)),
-							(cc_rb_backend_disable >> 16));
+	disabled_rb_mask = (RREG32(CC_RB_BACKEND_DISABLE) >> 16) & R6XX_MAX_BACKENDS_MASK;
+	tmp = (tiling_config & PIPE_TILING__MASK) >> PIPE_TILING__SHIFT;
+	tmp = r6xx_remap_render_backend(rdev, tmp, rdev->config.r600.max_backends,
+					R6XX_MAX_BACKENDS, disabled_rb_mask);
+	tiling_config |= tmp << 16;
+	rdev->config.r600.backend_map = tmp;
+
 	rdev->config.r600.tile_config = tiling_config;
-	rdev->config.r600.backend_map = backend_map;
-	tiling_config |= BACKEND_MAP(backend_map);
 	WREG32(GB_TILING_CONFIG, tiling_config);
 	WREG32(DCP_TILING_CONFIG, tiling_config & 0xffff);
 	WREG32(HDP_TILING_CONFIG, tiling_config & 0xffff);
 
-	/* Setup pipes */
-	WREG32(CC_RB_BACKEND_DISABLE, cc_rb_backend_disable);
-	WREG32(CC_GC_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
-	WREG32(GC_USER_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
-
 	tmp = R6XX_MAX_PIPES - r600_count_pipe_bits((cc_gc_shader_pipe_config & INACTIVE_QD_PIPES_MASK) >> 8);
 	WREG32(VGT_OUT_DEALLOC_CNTL, (tmp * 4) & DEALLOC_DIST_MASK);
 	WREG32(VGT_VERTEX_REUSE_BLOCK_CNTL, ((tmp * 4) - 2) & VTX_REUSE_DEPTH_MASK);