drm/radeon/kms: don't require up to 64k allocations. (v2)

This avoids needing to do a kmalloc > PAGE_SIZE for the main
indirect buffer chunk, it adds an accessor for all reads from
the chunk and caches a single page at a time for subsequent
reads.

changes since v1:
Use a two page pool which should be the most common case
a single packet spanning > PAGE_SIZE will be hit, but I'm
having trouble seeing anywhere we currently generate anything like that.
hopefully proper short page copying at end
added parser_error flag to set deep errors instead of having to test
every ib value fetch.
fixed bug in patch that went to list.

Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index bb151ec..1ebea8c 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -697,17 +697,18 @@
 		struct radeon_cs_packet *pkt,
 		unsigned idx, unsigned reg)
 {
-	struct radeon_cs_chunk *ib_chunk;
 	struct radeon_cs_reloc *reloc;
 	struct r100_cs_track *track;
 	volatile uint32_t *ib;
 	uint32_t tmp, tile_flags = 0;
 	unsigned i;
 	int r;
+	u32 idx_value;
 
 	ib = p->ib->ptr;
-	ib_chunk = &p->chunks[p->chunk_ib_idx];
 	track = (struct r100_cs_track *)p->track;
+	idx_value = radeon_get_ib_value(p, idx);
+
 	switch(reg) {
 	case AVIVO_D1MODE_VLINE_START_END:
 	case RADEON_CRTC_GUI_TRIG_VLINE:
@@ -738,8 +739,8 @@
 			return r;
 		}
 		track->cb[i].robj = reloc->robj;
-		track->cb[i].offset = ib_chunk->kdata[idx];
-		ib[idx] = ib_chunk->kdata[idx] + ((u32)reloc->lobj.gpu_offset);
+		track->cb[i].offset = idx_value;
+		ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
 		break;
 	case R300_ZB_DEPTHOFFSET:
 		r = r100_cs_packet_next_reloc(p, &reloc);
@@ -750,8 +751,8 @@
 			return r;
 		}
 		track->zb.robj = reloc->robj;
-		track->zb.offset = ib_chunk->kdata[idx];
-		ib[idx] = ib_chunk->kdata[idx] + ((u32)reloc->lobj.gpu_offset);
+		track->zb.offset = idx_value;
+		ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
 		break;
 	case R300_TX_OFFSET_0:
 	case R300_TX_OFFSET_0+4:
@@ -777,32 +778,32 @@
 			r100_cs_dump_packet(p, pkt);
 			return r;
 		}
-		ib[idx] = ib_chunk->kdata[idx] + ((u32)reloc->lobj.gpu_offset);
+		ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
 		track->textures[i].robj = reloc->robj;
 		break;
 	/* Tracked registers */
 	case 0x2084:
 		/* VAP_VF_CNTL */
-		track->vap_vf_cntl = ib_chunk->kdata[idx];
+		track->vap_vf_cntl = idx_value;
 		break;
 	case 0x20B4:
 		/* VAP_VTX_SIZE */
-		track->vtx_size = ib_chunk->kdata[idx] & 0x7F;
+		track->vtx_size = idx_value & 0x7F;
 		break;
 	case 0x2134:
 		/* VAP_VF_MAX_VTX_INDX */
-		track->max_indx = ib_chunk->kdata[idx] & 0x00FFFFFFUL;
+		track->max_indx = idx_value & 0x00FFFFFFUL;
 		break;
 	case 0x43E4:
 		/* SC_SCISSOR1 */
-		track->maxy = ((ib_chunk->kdata[idx] >> 13) & 0x1FFF) + 1;
+		track->maxy = ((idx_value >> 13) & 0x1FFF) + 1;
 		if (p->rdev->family < CHIP_RV515) {
 			track->maxy -= 1440;
 		}
 		break;
 	case 0x4E00:
 		/* RB3D_CCTL */
-		track->num_cb = ((ib_chunk->kdata[idx] >> 5) & 0x3) + 1;
+		track->num_cb = ((idx_value >> 5) & 0x3) + 1;
 		break;
 	case 0x4E38:
 	case 0x4E3C:
@@ -825,13 +826,13 @@
 		if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO)
 			tile_flags |= R300_COLOR_MICROTILE_ENABLE;
 
-		tmp = ib_chunk->kdata[idx] & ~(0x7 << 16);
+		tmp = idx_value & ~(0x7 << 16);
 		tmp |= tile_flags;
 		ib[idx] = tmp;
 
 		i = (reg - 0x4E38) >> 2;
-		track->cb[i].pitch = ib_chunk->kdata[idx] & 0x3FFE;
-		switch (((ib_chunk->kdata[idx] >> 21) & 0xF)) {
+		track->cb[i].pitch = idx_value & 0x3FFE;
+		switch (((idx_value >> 21) & 0xF)) {
 		case 9:
 		case 11:
 		case 12:
@@ -854,13 +855,13 @@
 			break;
 		default:
 			DRM_ERROR("Invalid color buffer format (%d) !\n",
-				  ((ib_chunk->kdata[idx] >> 21) & 0xF));
+				  ((idx_value >> 21) & 0xF));
 			return -EINVAL;
 		}
 		break;
 	case 0x4F00:
 		/* ZB_CNTL */
-		if (ib_chunk->kdata[idx] & 2) {
+		if (idx_value & 2) {
 			track->z_enabled = true;
 		} else {
 			track->z_enabled = false;
@@ -868,7 +869,7 @@
 		break;
 	case 0x4F10:
 		/* ZB_FORMAT */
-		switch ((ib_chunk->kdata[idx] & 0xF)) {
+		switch ((idx_value & 0xF)) {
 		case 0:
 		case 1:
 			track->zb.cpp = 2;
@@ -878,7 +879,7 @@
 			break;
 		default:
 			DRM_ERROR("Invalid z buffer format (%d) !\n",
-				  (ib_chunk->kdata[idx] & 0xF));
+				  (idx_value & 0xF));
 			return -EINVAL;
 		}
 		break;
@@ -897,17 +898,17 @@
 		if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO)
 			tile_flags |= R300_DEPTHMICROTILE_TILED;;
 
-		tmp = ib_chunk->kdata[idx] & ~(0x7 << 16);
+		tmp = idx_value & ~(0x7 << 16);
 		tmp |= tile_flags;
 		ib[idx] = tmp;
 
-		track->zb.pitch = ib_chunk->kdata[idx] & 0x3FFC;
+		track->zb.pitch = idx_value & 0x3FFC;
 		break;
 	case 0x4104:
 		for (i = 0; i < 16; i++) {
 			bool enabled;
 
-			enabled = !!(ib_chunk->kdata[idx] & (1 << i));
+			enabled = !!(idx_value & (1 << i));
 			track->textures[i].enabled = enabled;
 		}
 		break;
@@ -929,9 +930,9 @@
 	case 0x44FC:
 		/* TX_FORMAT1_[0-15] */
 		i = (reg - 0x44C0) >> 2;
-		tmp = (ib_chunk->kdata[idx] >> 25) & 0x3;
+		tmp = (idx_value >> 25) & 0x3;
 		track->textures[i].tex_coord_type = tmp;
-		switch ((ib_chunk->kdata[idx] & 0x1F)) {
+		switch ((idx_value & 0x1F)) {
 		case R300_TX_FORMAT_X8:
 		case R300_TX_FORMAT_Y4X4:
 		case R300_TX_FORMAT_Z3Y3X2:
@@ -971,7 +972,7 @@
 			break;
 		default:
 			DRM_ERROR("Invalid texture format %u\n",
-				  (ib_chunk->kdata[idx] & 0x1F));
+				  (idx_value & 0x1F));
 			return -EINVAL;
 			break;
 		}
@@ -994,11 +995,11 @@
 	case 0x443C:
 		/* TX_FILTER0_[0-15] */
 		i = (reg - 0x4400) >> 2;
-		tmp = ib_chunk->kdata[idx] & 0x7;
+		tmp = idx_value & 0x7;
 		if (tmp == 2 || tmp == 4 || tmp == 6) {
 			track->textures[i].roundup_w = false;
 		}
-		tmp = (ib_chunk->kdata[idx] >> 3) & 0x7;
+		tmp = (idx_value >> 3) & 0x7;
 		if (tmp == 2 || tmp == 4 || tmp == 6) {
 			track->textures[i].roundup_h = false;
 		}
@@ -1021,12 +1022,12 @@
 	case 0x453C:
 		/* TX_FORMAT2_[0-15] */
 		i = (reg - 0x4500) >> 2;
-		tmp = ib_chunk->kdata[idx] & 0x3FFF;
+		tmp = idx_value & 0x3FFF;
 		track->textures[i].pitch = tmp + 1;
 		if (p->rdev->family >= CHIP_RV515) {
-			tmp = ((ib_chunk->kdata[idx] >> 15) & 1) << 11;
+			tmp = ((idx_value >> 15) & 1) << 11;
 			track->textures[i].width_11 = tmp;
-			tmp = ((ib_chunk->kdata[idx] >> 16) & 1) << 11;
+			tmp = ((idx_value >> 16) & 1) << 11;
 			track->textures[i].height_11 = tmp;
 		}
 		break;
@@ -1048,15 +1049,15 @@
 	case 0x44BC:
 		/* TX_FORMAT0_[0-15] */
 		i = (reg - 0x4480) >> 2;
-		tmp = ib_chunk->kdata[idx] & 0x7FF;
+		tmp = idx_value & 0x7FF;
 		track->textures[i].width = tmp + 1;
-		tmp = (ib_chunk->kdata[idx] >> 11) & 0x7FF;
+		tmp = (idx_value >> 11) & 0x7FF;
 		track->textures[i].height = tmp + 1;
-		tmp = (ib_chunk->kdata[idx] >> 26) & 0xF;
+		tmp = (idx_value >> 26) & 0xF;
 		track->textures[i].num_levels = tmp;
-		tmp = ib_chunk->kdata[idx] & (1 << 31);
+		tmp = idx_value & (1 << 31);
 		track->textures[i].use_pitch = !!tmp;
-		tmp = (ib_chunk->kdata[idx] >> 22) & 0xF;
+		tmp = (idx_value >> 22) & 0xF;
 		track->textures[i].txdepth = tmp;
 		break;
 	case R300_ZB_ZPASS_ADDR:
@@ -1067,7 +1068,7 @@
 			r100_cs_dump_packet(p, pkt);
 			return r;
 		}
-		ib[idx] = ib_chunk->kdata[idx] + ((u32)reloc->lobj.gpu_offset);
+		ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
 		break;
 	case 0x4be8:
 		/* valid register only on RV530 */
@@ -1085,60 +1086,20 @@
 static int r300_packet3_check(struct radeon_cs_parser *p,
 			      struct radeon_cs_packet *pkt)
 {
-	struct radeon_cs_chunk *ib_chunk;
-
 	struct radeon_cs_reloc *reloc;
 	struct r100_cs_track *track;
 	volatile uint32_t *ib;
 	unsigned idx;
-	unsigned i, c;
 	int r;
 
 	ib = p->ib->ptr;
-	ib_chunk = &p->chunks[p->chunk_ib_idx];
 	idx = pkt->idx + 1;
 	track = (struct r100_cs_track *)p->track;
 	switch(pkt->opcode) {
 	case PACKET3_3D_LOAD_VBPNTR:
-		c = ib_chunk->kdata[idx++] & 0x1F;
-		track->num_arrays = c;
-		for (i = 0; i < (c - 1); i+=2, idx+=3) {
-			r = r100_cs_packet_next_reloc(p, &reloc);
-			if (r) {
-				DRM_ERROR("No reloc for packet3 %d\n",
-					  pkt->opcode);
-				r100_cs_dump_packet(p, pkt);
-				return r;
-			}
-			ib[idx+1] = ib_chunk->kdata[idx+1] + ((u32)reloc->lobj.gpu_offset);
-			track->arrays[i + 0].robj = reloc->robj;
-			track->arrays[i + 0].esize = ib_chunk->kdata[idx] >> 8;
-			track->arrays[i + 0].esize &= 0x7F;
-			r = r100_cs_packet_next_reloc(p, &reloc);
-			if (r) {
-				DRM_ERROR("No reloc for packet3 %d\n",
-					  pkt->opcode);
-				r100_cs_dump_packet(p, pkt);
-				return r;
-			}
-			ib[idx+2] = ib_chunk->kdata[idx+2] + ((u32)reloc->lobj.gpu_offset);
-			track->arrays[i + 1].robj = reloc->robj;
-			track->arrays[i + 1].esize = ib_chunk->kdata[idx] >> 24;
-			track->arrays[i + 1].esize &= 0x7F;
-		}
-		if (c & 1) {
-			r = r100_cs_packet_next_reloc(p, &reloc);
-			if (r) {
-				DRM_ERROR("No reloc for packet3 %d\n",
-					  pkt->opcode);
-				r100_cs_dump_packet(p, pkt);
-				return r;
-			}
-			ib[idx+1] = ib_chunk->kdata[idx+1] + ((u32)reloc->lobj.gpu_offset);
-			track->arrays[i + 0].robj = reloc->robj;
-			track->arrays[i + 0].esize = ib_chunk->kdata[idx] >> 8;
-			track->arrays[i + 0].esize &= 0x7F;
-		}
+		r = r100_packet3_load_vbpntr(p, pkt, idx);
+		if (r)
+			return r;
 		break;
 	case PACKET3_INDX_BUFFER:
 		r = r100_cs_packet_next_reloc(p, &reloc);
@@ -1147,7 +1108,7 @@
 			r100_cs_dump_packet(p, pkt);
 			return r;
 		}
-		ib[idx+1] = ib_chunk->kdata[idx+1] + ((u32)reloc->lobj.gpu_offset);
+		ib[idx+1] = radeon_get_ib_value(p, idx + 1) + ((u32)reloc->lobj.gpu_offset);
 		r = r100_cs_track_check_pkt3_indx_buffer(p, pkt, reloc->robj);
 		if (r) {
 			return r;
@@ -1158,11 +1119,11 @@
 		/* Number of dwords is vtx_size * (num_vertices - 1)
 		 * PRIM_WALK must be equal to 3 vertex data in embedded
 		 * in cmd stream */
-		if (((ib_chunk->kdata[idx+1] >> 4) & 0x3) != 3) {
+		if (((radeon_get_ib_value(p, idx + 1) >> 4) & 0x3) != 3) {
 			DRM_ERROR("PRIM_WALK must be 3 for IMMD draw\n");
 			return -EINVAL;
 		}
-		track->vap_vf_cntl = ib_chunk->kdata[idx+1];
+		track->vap_vf_cntl = radeon_get_ib_value(p, idx + 1);
 		track->immd_dwords = pkt->count - 1;
 		r = r100_cs_track_check(p->rdev, track);
 		if (r) {
@@ -1173,11 +1134,11 @@
 		/* Number of dwords is vtx_size * (num_vertices - 1)
 		 * PRIM_WALK must be equal to 3 vertex data in embedded
 		 * in cmd stream */
-		if (((ib_chunk->kdata[idx] >> 4) & 0x3) != 3) {
+		if (((radeon_get_ib_value(p, idx) >> 4) & 0x3) != 3) {
 			DRM_ERROR("PRIM_WALK must be 3 for IMMD draw\n");
 			return -EINVAL;
 		}
-		track->vap_vf_cntl = ib_chunk->kdata[idx];
+		track->vap_vf_cntl = radeon_get_ib_value(p, idx);
 		track->immd_dwords = pkt->count;
 		r = r100_cs_track_check(p->rdev, track);
 		if (r) {
@@ -1185,28 +1146,28 @@
 		}
 		break;
 	case PACKET3_3D_DRAW_VBUF:
-		track->vap_vf_cntl = ib_chunk->kdata[idx + 1];
+		track->vap_vf_cntl = radeon_get_ib_value(p, idx + 1);
 		r = r100_cs_track_check(p->rdev, track);
 		if (r) {
 			return r;
 		}
 		break;
 	case PACKET3_3D_DRAW_VBUF_2:
-		track->vap_vf_cntl = ib_chunk->kdata[idx];
+		track->vap_vf_cntl = radeon_get_ib_value(p, idx);
 		r = r100_cs_track_check(p->rdev, track);
 		if (r) {
 			return r;
 		}
 		break;
 	case PACKET3_3D_DRAW_INDX:
-		track->vap_vf_cntl = ib_chunk->kdata[idx + 1];
+		track->vap_vf_cntl = radeon_get_ib_value(p, idx + 1);
 		r = r100_cs_track_check(p->rdev, track);
 		if (r) {
 			return r;
 		}
 		break;
 	case PACKET3_3D_DRAW_INDX_2:
-		track->vap_vf_cntl = ib_chunk->kdata[idx];
+		track->vap_vf_cntl = radeon_get_ib_value(p, idx);
 		r = r100_cs_track_check(p->rdev, track);
 		if (r) {
 			return r;