drm/radeon/kms: add support for streamout v7

v2: agd5f: add strmout CS checking, copy_dw register checking

v3: agd5f: don't use cs_check_reg() for copy_dw checking as it
will incorrectly patch the command stream for certain regs.

v4: agd5f: add warning if safe reg check fails for copy_dw

v5: agd5f: add stricter checking for 6xx/7xx

v6: agd5f: add range checking for copy_dw on eg+,
add sx_surface_sync to safe reg list for 7xx.

v7: agd5f: add stricter checking for eg+

Signed-off-by: Marek Olšák <maraeo@gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c
index 8e8cd85..4e2cadc 100644
--- a/drivers/gpu/drm/radeon/evergreen_cs.c
+++ b/drivers/gpu/drm/radeon/evergreen_cs.c
@@ -60,6 +60,10 @@
 	u32			cb_shader_mask;
 	u32			vgt_strmout_config;
 	u32			vgt_strmout_buffer_config;
+	struct radeon_bo	*vgt_strmout_bo[4];
+	u64			vgt_strmout_bo_mc[4];
+	u32			vgt_strmout_bo_offset[4];
+	u32			vgt_strmout_size[4];
 	u32			db_depth_control;
 	u32			db_depth_view;
 	u32			db_depth_size;
@@ -159,16 +163,41 @@
 	track->db_s_write_offset = 0xFFFFFFFF;
 	track->db_s_read_bo = NULL;
 	track->db_s_write_bo = NULL;
+
+	for (i = 0; i < 4; i++) {
+		track->vgt_strmout_size[i] = 0;
+		track->vgt_strmout_bo[i] = NULL;
+		track->vgt_strmout_bo_offset[i] = 0xFFFFFFFF;
+		track->vgt_strmout_bo_mc[i] = 0xFFFFFFFF;
+	}
 }
 
 static int evergreen_cs_track_check(struct radeon_cs_parser *p)
 {
 	struct evergreen_cs_track *track = p->track;
+	int i, j;
 
-	/* we don't support stream out buffer yet */
-	if (track->vgt_strmout_config || track->vgt_strmout_buffer_config) {
-		dev_warn(p->dev, "this kernel doesn't support SMX output buffer\n");
-		return -EINVAL;
+	/* check streamout */
+	for (i = 0; i < 4; i++) {
+		if (track->vgt_strmout_config & (1 << i)) {
+			for (j = 0; j < 4; j++) {
+				if ((track->vgt_strmout_buffer_config >> (i * 4)) & (1 << j)) {
+					if (track->vgt_strmout_bo[j]) {
+						u64 offset = (u64)track->vgt_strmout_bo_offset[j] +
+							(u64)track->vgt_strmout_size[j];
+						if (offset > radeon_bo_size(track->vgt_strmout_bo[i])) {
+							DRM_ERROR("streamout %d bo too small: 0x%llx, 0x%lx\n",
+								  j, offset,
+								  radeon_bo_size(track->vgt_strmout_bo[j]));
+							return -EINVAL;
+						}
+					} else {
+						dev_warn(p->dev, "No buffer for streamout %d\n", j);
+						return -EINVAL;
+					}
+				}
+			}
+		}
 	}
 
 	/* XXX fill in */
@@ -597,6 +626,38 @@
 	case VGT_STRMOUT_BUFFER_CONFIG:
 		track->vgt_strmout_buffer_config = radeon_get_ib_value(p, idx);
 		break;
+	case VGT_STRMOUT_BUFFER_BASE_0:
+	case VGT_STRMOUT_BUFFER_BASE_1:
+	case VGT_STRMOUT_BUFFER_BASE_2:
+	case VGT_STRMOUT_BUFFER_BASE_3:
+		r = evergreen_cs_packet_next_reloc(p, &reloc);
+		if (r) {
+			dev_warn(p->dev, "bad SET_CONTEXT_REG "
+					"0x%04X\n", reg);
+			return -EINVAL;
+		}
+		tmp = (reg - VGT_STRMOUT_BUFFER_BASE_0) / 16;
+		track->vgt_strmout_bo_offset[tmp] = radeon_get_ib_value(p, idx) << 8;
+		ib[idx] += (u32)((reloc->lobj.gpu_offset >> 8) & 0xffffffff);
+		track->vgt_strmout_bo[tmp] = reloc->robj;
+		track->vgt_strmout_bo_mc[tmp] = reloc->lobj.gpu_offset;
+		break;
+	case VGT_STRMOUT_BUFFER_SIZE_0:
+	case VGT_STRMOUT_BUFFER_SIZE_1:
+	case VGT_STRMOUT_BUFFER_SIZE_2:
+	case VGT_STRMOUT_BUFFER_SIZE_3:
+		tmp = (reg - VGT_STRMOUT_BUFFER_SIZE_0) / 16;
+		/* size in register is DWs, convert to bytes */
+		track->vgt_strmout_size[tmp] = radeon_get_ib_value(p, idx) * 4;
+		break;
+	case CP_COHER_BASE:
+		r = evergreen_cs_packet_next_reloc(p, &reloc);
+		if (r) {
+			dev_warn(p->dev, "missing reloc for CP_COHER_BASE "
+					"0x%04X\n", reg);
+			return -EINVAL;
+		}
+		ib[idx] += (u32)((reloc->lobj.gpu_offset >> 8) & 0xffffffff);
 	case CB_TARGET_MASK:
 		track->cb_target_mask = radeon_get_ib_value(p, idx);
 		break;
@@ -1014,6 +1075,32 @@
 	return 0;
 }
 
+static bool evergreen_is_safe_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
+{
+	u32 last_reg, m, i;
+
+	if (p->rdev->family >= CHIP_CAYMAN)
+		last_reg = ARRAY_SIZE(cayman_reg_safe_bm);
+	else
+		last_reg = ARRAY_SIZE(evergreen_reg_safe_bm);
+
+	i = (reg >> 7);
+	if (i >= last_reg) {
+		dev_warn(p->dev, "forbidden register 0x%08x at %d\n", reg, idx);
+		return false;
+	}
+	m = 1 << ((reg >> 2) & 31);
+	if (p->rdev->family >= CHIP_CAYMAN) {
+		if (!(cayman_reg_safe_bm[i] & m))
+			return true;
+	} else {
+		if (!(evergreen_reg_safe_bm[i] & m))
+			return true;
+	}
+	dev_warn(p->dev, "forbidden register 0x%08x at %d\n", reg, idx);
+	return false;
+}
+
 static int evergreen_packet3_check(struct radeon_cs_parser *p,
 				   struct radeon_cs_packet *pkt)
 {
@@ -1451,6 +1538,100 @@
 			return -EINVAL;
 		}
 		break;
+	case PACKET3_STRMOUT_BUFFER_UPDATE:
+		if (pkt->count != 4) {
+			DRM_ERROR("bad STRMOUT_BUFFER_UPDATE (invalid count)\n");
+			return -EINVAL;
+		}
+		/* Updating memory at DST_ADDRESS. */
+		if (idx_value & 0x1) {
+			u64 offset;
+			r = evergreen_cs_packet_next_reloc(p, &reloc);
+			if (r) {
+				DRM_ERROR("bad STRMOUT_BUFFER_UPDATE (missing dst reloc)\n");
+				return -EINVAL;
+			}
+			offset = radeon_get_ib_value(p, idx+1);
+			offset += ((u64)(radeon_get_ib_value(p, idx+2) & 0xff)) << 32;
+			if ((offset + 4) > radeon_bo_size(reloc->robj)) {
+				DRM_ERROR("bad STRMOUT_BUFFER_UPDATE dst bo too small: 0x%llx, 0x%lx\n",
+					  offset + 4, radeon_bo_size(reloc->robj));
+				return -EINVAL;
+			}
+			ib[idx+1] += (u32)(reloc->lobj.gpu_offset & 0xffffffff);
+			ib[idx+2] += upper_32_bits(reloc->lobj.gpu_offset) & 0xff;
+		}
+		/* Reading data from SRC_ADDRESS. */
+		if (((idx_value >> 1) & 0x3) == 2) {
+			u64 offset;
+			r = evergreen_cs_packet_next_reloc(p, &reloc);
+			if (r) {
+				DRM_ERROR("bad STRMOUT_BUFFER_UPDATE (missing src reloc)\n");
+				return -EINVAL;
+			}
+			offset = radeon_get_ib_value(p, idx+3);
+			offset += ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
+			if ((offset + 4) > radeon_bo_size(reloc->robj)) {
+				DRM_ERROR("bad STRMOUT_BUFFER_UPDATE src bo too small: 0x%llx, 0x%lx\n",
+					  offset + 4, radeon_bo_size(reloc->robj));
+				return -EINVAL;
+			}
+			ib[idx+3] += (u32)(reloc->lobj.gpu_offset & 0xffffffff);
+			ib[idx+4] += upper_32_bits(reloc->lobj.gpu_offset) & 0xff;
+		}
+		break;
+	case PACKET3_COPY_DW:
+		if (pkt->count != 4) {
+			DRM_ERROR("bad COPY_DW (invalid count)\n");
+			return -EINVAL;
+		}
+		if (idx_value & 0x1) {
+			u64 offset;
+			/* SRC is memory. */
+			r = evergreen_cs_packet_next_reloc(p, &reloc);
+			if (r) {
+				DRM_ERROR("bad COPY_DW (missing src reloc)\n");
+				return -EINVAL;
+			}
+			offset = radeon_get_ib_value(p, idx+1);
+			offset += ((u64)(radeon_get_ib_value(p, idx+2) & 0xff)) << 32;
+			if ((offset + 4) > radeon_bo_size(reloc->robj)) {
+				DRM_ERROR("bad COPY_DW src bo too small: 0x%llx, 0x%lx\n",
+					  offset + 4, radeon_bo_size(reloc->robj));
+				return -EINVAL;
+			}
+			ib[idx+1] += (u32)(reloc->lobj.gpu_offset & 0xffffffff);
+			ib[idx+2] += upper_32_bits(reloc->lobj.gpu_offset) & 0xff;
+		} else {
+			/* SRC is a reg. */
+			reg = radeon_get_ib_value(p, idx+1) << 2;
+			if (!evergreen_is_safe_reg(p, reg, idx+1))
+				return -EINVAL;
+		}
+		if (idx_value & 0x2) {
+			u64 offset;
+			/* DST is memory. */
+			r = evergreen_cs_packet_next_reloc(p, &reloc);
+			if (r) {
+				DRM_ERROR("bad COPY_DW (missing dst reloc)\n");
+				return -EINVAL;
+			}
+			offset = radeon_get_ib_value(p, idx+3);
+			offset += ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
+			if ((offset + 4) > radeon_bo_size(reloc->robj)) {
+				DRM_ERROR("bad COPY_DW dst bo too small: 0x%llx, 0x%lx\n",
+					  offset + 4, radeon_bo_size(reloc->robj));
+				return -EINVAL;
+			}
+			ib[idx+3] += (u32)(reloc->lobj.gpu_offset & 0xffffffff);
+			ib[idx+4] += upper_32_bits(reloc->lobj.gpu_offset) & 0xff;
+		} else {
+			/* DST is a reg. */
+			reg = radeon_get_ib_value(p, idx+3) << 2;
+			if (!evergreen_is_safe_reg(p, reg, idx+3))
+				return -EINVAL;
+		}
+		break;
 	case PACKET3_NOP:
 		break;
 	default: