radeon/kms: cleanup async dma packet checking

This simplifies and cleans up the async DMA packet checking: instead of
decoding the tiled, new_cmd, and misc bits of the packet header
separately, decode them as a single sub_cmd field and switch on its
value directly. This flattens the nested if/switch ladders in
evergreen_dma_cs_parse() and evergreen_dma_ib_parse() into one switch
per packet type.
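
For reference, a sketch of the combined decode this relies on. The
GET_DMA_SUB_CMD() macro itself is not part of this diff; it is assumed
to be added alongside the other GET_DMA_* header macros (e.g. in
evergreend.h):

  /* header bits 20-27: misc (20-22), tiled (23), new_cmd (26) */
  #define GET_DMA_SUB_CMD(h) (((h) & 0x0ff00000) >> 20)

Under that decode the old (tiled, new_cmd, misc) combinations map to:

  linear, old cmd          -> 0x00
  tiled,  old cmd          -> 0x08
  linear, new cmd, misc m  -> 0x40 | m   (0x40, 0x41, 0x44)
  tiled,  new cmd, misc m  -> 0x48 | m   (0x48, 0x49, 0x4b, 0x4c, 0x4d, 0x4f)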

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c
index 4a9760a..d8f5d5f 100644
--- a/drivers/gpu/drm/radeon/evergreen_cs.c
+++ b/drivers/gpu/drm/radeon/evergreen_cs.c
@@ -2670,16 +2670,6 @@
 	return 0;
 }
 
-/*
- *  DMA
- */
-
-#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28)
-#define GET_DMA_COUNT(h) ((h) & 0x000fffff)
-#define GET_DMA_T(h) (((h) & 0x00800000) >> 23)
-#define GET_DMA_NEW(h) (((h) & 0x04000000) >> 26)
-#define GET_DMA_MISC(h) (((h) & 0x0700000) >> 20)
-
 /**
  * evergreen_dma_cs_parse() - parse the DMA IB
  * @p:		parser structure holding parsing context.
@@ -2693,9 +2683,9 @@
 {
 	struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx];
 	struct radeon_cs_reloc *src_reloc, *dst_reloc, *dst2_reloc;
-	u32 header, cmd, count, tiled, new_cmd, misc;
+	u32 header, cmd, count, sub_cmd;
 	volatile u32 *ib = p->ib.ptr;
-	u32 idx, idx_value;
+	u32 idx;
 	u64 src_offset, dst_offset, dst2_offset;
 	int r;
 
@@ -2709,9 +2699,7 @@
 		header = radeon_get_ib_value(p, idx);
 		cmd = GET_DMA_CMD(header);
 		count = GET_DMA_COUNT(header);
-		tiled = GET_DMA_T(header);
-		new_cmd = GET_DMA_NEW(header);
-		misc = GET_DMA_MISC(header);
+		sub_cmd = GET_DMA_SUB_CMD(header);
 
 		switch (cmd) {
 		case DMA_PACKET_WRITE:
@@ -2720,19 +2708,27 @@
 				DRM_ERROR("bad DMA_PACKET_WRITE\n");
 				return -EINVAL;
 			}
-			if (tiled) {
+			switch (sub_cmd) {
+			/* tiled */
+			case 8:
 				dst_offset = ib[idx+1];
 				dst_offset <<= 8;
 
 				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
 				p->idx += count + 7;
-			} else {
+				break;
+			/* linear */
+			case 0:
 				dst_offset = ib[idx+1];
 				dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32;
 
 				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
 				ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
 				p->idx += count + 3;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_WRITE [%6d] 0x%08x sub cmd is not 0 or 8\n", idx, ib[idx+0]);
+				return -EINVAL;
 			}
 			if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
 				dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n",
@@ -2751,338 +2747,330 @@
 				DRM_ERROR("bad DMA_PACKET_COPY\n");
 				return -EINVAL;
 			}
-			if (tiled) {
-				idx_value = radeon_get_ib_value(p, idx + 2);
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2T, frame to fields */
-						if (idx_value & (1 << 31)) {
-							DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset <<= 8;
-						dst2_offset = ib[idx+2];
-						dst2_offset <<= 8;
-						src_offset = ib[idx+8];
-						src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-						ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 10;
-						break;
-					case 1:
-						/* L2T, T2L partial */
-						if (p->family < CHIP_CAYMAN) {
-							DRM_ERROR("L2T, T2L Partial is cayman only !\n");
-							return -EINVAL;
-						}
-						/* detile bit */
-						if (idx_value & (1 << 31)) {
-							/* tiled src, linear dst */
-							ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-							ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						} else {
-							/* linear src, tiled dst */
-							ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-							ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						}
-						p->idx += 12;
-						break;
-					case 3:
-						/* L2T, broadcast */
-						if (idx_value & (1 << 31)) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset <<= 8;
-						dst2_offset = ib[idx+2];
-						dst2_offset <<= 8;
-						src_offset = ib[idx+8];
-						src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-						ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 10;
-						break;
-					case 4:
-						/* L2T, T2L */
-						/* detile bit */
-						if (idx_value & (1 << 31)) {
-							/* tiled src, linear dst */
-							src_offset = ib[idx+1];
-							src_offset <<= 8;
-							ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-							dst_offset = ib[idx+7];
-							dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						} else {
-							/* linear src, tiled dst */
-							src_offset = ib[idx+7];
-							src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-							dst_offset = ib[idx+1];
-							dst_offset <<= 8;
-							ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						}
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						p->idx += 9;
-						break;
-					case 5:
-						/* T2T partial */
-						if (p->family < CHIP_CAYMAN) {
-							DRM_ERROR("L2T, T2L Partial is cayman only !\n");
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-						ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						p->idx += 13;
-						break;
-					case 7:
-						/* L2T, broadcast */
-						if (idx_value & (1 << 31)) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset <<= 8;
-						dst2_offset = ib[idx+2];
-						dst2_offset <<= 8;
-						src_offset = ib[idx+8];
-						src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-						ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 10;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				} else {
-					switch (misc) {
-					case 0:
-						/* detile bit */
-						if (idx_value & (1 << 31)) {
-							/* tiled src, linear dst */
-							src_offset = ib[idx+1];
-							src_offset <<= 8;
-							ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-							dst_offset = ib[idx+7];
-							dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						} else {
-							/* linear src, tiled dst */
-							src_offset = ib[idx+7];
-							src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
-							ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-							ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-							dst_offset = ib[idx+1];
-							dst_offset <<= 8;
-							ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-						}
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						p->idx += 9;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
+			switch (sub_cmd) {
+			/* Copy L2L, DW aligned */
+			case 0x00:
+				/* L2L, dw */
+				src_offset = ib[idx+2];
+				src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+				dst_offset = ib[idx+1];
+				dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
 				}
-			} else {
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2L, byte */
-						src_offset = ib[idx+2];
-						src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
-						dst_offset = ib[idx+1];
-						dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
-						if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n",
-								 src_offset + count, radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n",
-								 dst_offset + count, radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 5;
-						break;
-					case 1:
-						/* L2L, partial */
-						if (p->family < CHIP_CAYMAN) {
-							DRM_ERROR("L2L Partial is cayman only !\n");
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
-						ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 5;
+				break;
+			/* Copy L2T/T2L */
+			case 0x08:
+				/* detile bit */
+				if (ib[idx + 2] & (1 << 31)) {
+					/* tiled src, linear dst */
+					src_offset = ib[idx+1];
+					src_offset <<= 8;
+					ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
 
-						p->idx += 9;
-						break;
-					case 4:
-						/* L2L, dw, broadcast */
-						r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-						if (r) {
-							DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n");
-							return -EINVAL;
-						}
-						dst_offset = ib[idx+1];
-						dst_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
-						dst2_offset = ib[idx+2];
-						dst2_offset |= ((u64)(ib[idx+5] & 0xff)) << 32;
-						src_offset = ib[idx+3];
-						src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32;
-						if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n",
-								 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n",
-								 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-							return -EINVAL;
-						}
-						if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-							dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n",
-								 dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-							return -EINVAL;
-						}
-						ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-						ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff;
-						ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-						p->idx += 7;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
+					dst_offset = ib[idx+7];
+					dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
 				} else {
-					/* L2L, dw */
-					src_offset = ib[idx+2];
-					src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+					/* linear src, tiled dst */
+					src_offset = ib[idx+7];
+					src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
 					dst_offset = ib[idx+1];
-					dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
-					if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-						dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n",
-							 src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-						return -EINVAL;
-					}
-					if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-						dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n",
-							 dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-						return -EINVAL;
-					}
-					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-					ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-					ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-					ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-					p->idx += 5;
+					dst_offset <<= 8;
+					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
 				}
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				p->idx += 9;
+				break;
+			/* Copy L2L, byte aligned */
+			case 0x40:
+				/* L2L, byte */
+				src_offset = ib[idx+2];
+				src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+				dst_offset = ib[idx+1];
+				dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
+				if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n",
+							src_offset + count, radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n",
+							dst_offset + count, radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 5;
+				break;
+			/* Copy L2L, partial */
+			case 0x41:
+				/* L2L, partial */
+				if (p->family < CHIP_CAYMAN) {
+					DRM_ERROR("L2L Partial is cayman only !\n");
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
+				ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+
+				p->idx += 9;
+				break;
+			/* Copy L2L, DW aligned, broadcast */
+			case 0x44:
+				/* L2L, dw, broadcast */
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
+				dst2_offset = ib[idx+2];
+				dst2_offset |= ((u64)(ib[idx+5] & 0xff)) << 32;
+				src_offset = ib[idx+3];
+				src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff;
+				ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 7;
+				break;
+			/* Copy L2T Frame to Field */
+			case 0x48:
+				if (ib[idx + 2] & (1 << 31)) {
+					DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset <<= 8;
+				dst2_offset = ib[idx+2];
+				dst2_offset <<= 8;
+				src_offset = ib[idx+8];
+				src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+				ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 10;
+				break;
+			/* Copy L2T/T2L, partial */
+			case 0x49:
+				/* L2T, T2L partial */
+				if (p->family < CHIP_CAYMAN) {
+					DRM_ERROR("L2T, T2L Partial is cayman only !\n");
+					return -EINVAL;
+				}
+				/* detile bit */
+				if (ib[idx + 2] & (1 << 31)) {
+					/* tiled src, linear dst */
+					ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+					ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				} else {
+					/* linear src, tiled dst */
+					ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
+					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				}
+				p->idx += 12;
+				break;
+			/* Copy L2T broadcast */
+			case 0x4b:
+				/* L2T, broadcast */
+				if (ib[idx + 2] & (1 << 31)) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset <<= 8;
+				dst2_offset = ib[idx+2];
+				dst2_offset <<= 8;
+				src_offset = ib[idx+8];
+				src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+				ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 10;
+				break;
+			/* Copy L2T/T2L (tile units) */
+			case 0x4c:
+				/* L2T, T2L */
+				/* detile bit */
+				if (ib[idx + 2] & (1 << 31)) {
+					/* tiled src, linear dst */
+					src_offset = ib[idx+1];
+					src_offset <<= 8;
+					ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+					dst_offset = ib[idx+7];
+					dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+				} else {
+					/* linear src, tiled dst */
+					src_offset = ib[idx+7];
+					src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32;
+					ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+					ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
+					dst_offset = ib[idx+1];
+					dst_offset <<= 8;
+					ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				}
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				p->idx += 9;
+				break;
+			/* Copy T2T, partial (tile units) */
+			case 0x4d:
+				/* T2T partial */
+				if (p->family < CHIP_CAYMAN) {
+					DRM_ERROR("L2T, T2L Partial is cayman only !\n");
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+				ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				p->idx += 13;
+				break;
+			/* Copy L2T broadcast (tile units) */
+			case 0x4f:
+				/* L2T, broadcast */
+				if (ib[idx + 2] & (1 << 31)) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+				if (r) {
+					DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+					return -EINVAL;
+				}
+				dst_offset = ib[idx+1];
+				dst_offset <<= 8;
+				dst2_offset = ib[idx+2];
+				dst2_offset <<= 8;
+				src_offset = ib[idx+8];
+				src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32;
+				if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
+							src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
+							dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+					return -EINVAL;
+				}
+				if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+					dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
+							dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+					return -EINVAL;
+				}
+				ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+				ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+				ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+				ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+				p->idx += 10;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_COPY [%6d] 0x%08x invalid sub cmd\n", idx, ib[idx+0]);
+				return -EINVAL;
 			}
 			break;
 		case DMA_PACKET_CONSTANT_FILL:
@@ -3435,88 +3423,79 @@
 int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 {
 	u32 idx = 0;
-	u32 header, cmd, count, tiled, new_cmd, misc;
+	u32 header, cmd, count, sub_cmd;
 
 	do {
 		header = ib->ptr[idx];
 		cmd = GET_DMA_CMD(header);
 		count = GET_DMA_COUNT(header);
-		tiled = GET_DMA_T(header);
-		new_cmd = GET_DMA_NEW(header);
-		misc = GET_DMA_MISC(header);
+		sub_cmd = GET_DMA_SUB_CMD(header);
 
 		switch (cmd) {
 		case DMA_PACKET_WRITE:
-			if (tiled)
+			switch (sub_cmd) {
+			/* tiled */
+			case 8:
 				idx += count + 7;
-			else
+				break;
+			/* linear */
+			case 0:
 				idx += count + 3;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_WRITE [%6d] 0x%08x sub cmd is not 0 or 8\n", idx, ib->ptr[idx]);
+				return -EINVAL;
+			}
 			break;
 		case DMA_PACKET_COPY:
-			if (tiled) {
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2T, frame to fields */
-						idx += 10;
-						break;
-					case 1:
-						/* L2T, T2L partial */
-						idx += 12;
-						break;
-					case 3:
-						/* L2T, broadcast */
-						idx += 10;
-						break;
-					case 4:
-						/* L2T, T2L */
-						idx += 9;
-						break;
-					case 5:
-						/* T2T partial */
-						idx += 13;
-						break;
-					case 7:
-						/* L2T, broadcast */
-						idx += 10;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				} else {
-					switch (misc) {
-					case 0:
-						idx += 9;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				}
-			} else {
-				if (new_cmd) {
-					switch (misc) {
-					case 0:
-						/* L2L, byte */
-						idx += 5;
-						break;
-					case 1:
-						/* L2L, partial */
-						idx += 9;
-						break;
-					case 4:
-						/* L2L, dw, broadcast */
-						idx += 7;
-						break;
-					default:
-						DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-						return -EINVAL;
-					}
-				} else {
-					/* L2L, dw */
-					idx += 5;
-				}
+			switch (sub_cmd) {
+			/* Copy L2L, DW aligned */
+			case 0x00:
+				idx += 5;
+				break;
+			/* Copy L2T/T2L */
+			case 0x08:
+				idx += 9;
+				break;
+			/* Copy L2L, byte aligned */
+			case 0x40:
+				idx += 5;
+				break;
+			/* Copy L2L, partial */
+			case 0x41:
+				idx += 9;
+				break;
+			/* Copy L2L, DW aligned, broadcast */
+			case 0x44:
+				idx += 7;
+				break;
+			/* Copy L2T Frame to Field */
+			case 0x48:
+				idx += 10;
+				break;
+			/* Copy L2T/T2L, partial */
+			case 0x49:
+				idx += 12;
+				break;
+			/* Copy L2T broadcast */
+			case 0x4b:
+				idx += 10;
+				break;
+			/* Copy L2T/T2L (tile units) */
+			case 0x4c:
+				idx += 9;
+				break;
+			/* Copy T2T, partial (tile units) */
+			case 0x4d:
+				idx += 13;
+				break;
+			/* Copy L2T broadcast (tile units) */
+			case 0x4f:
+				idx += 10;
+				break;
+			default:
+				DRM_ERROR("bad DMA_PACKET_COPY [%6d] 0x%08x invalid sub cmd\n", idx, ib->ptr[idx]);
+				return -EINVAL;
 			}
 			break;
 		case DMA_PACKET_CONSTANT_FILL: