nv50: force channel vram access through vm

If we ever want to be able to use the 3D engine we have no choice.  It
appears that the tiling setup (required for 3D on G8x) is in the page tables.

The immediate benefit of this change, however, is that it's no longer
possible for a client to use the GPU to render over the top of important
engine setup tables, which also live in VRAM.

G8x VRAM size is limited to 512MiB at the moment, as we use a 1:1 mapping
of real VRAM pages to their offset from the start of a channel's VRAM
DMA object, and only populate a single PDE for VRAM use.
diff --git a/shared-core/nouveau_drm.h b/shared-core/nouveau_drm.h
index 5f07fcb..cf76205 100644
--- a/shared-core/nouveau_drm.h
+++ b/shared-core/nouveau_drm.h
@@ -87,6 +87,10 @@
 #define NOUVEAU_MEM_MAPPED		0x00000100
 #define NOUVEAU_MEM_INSTANCE		0x00000200 /* internal */
 #define NOUVEAU_MEM_NOTIFIER            0x00000400 /* internal */
+#define NOUVEAU_MEM_NOVM		0x00000800 /* internal */
+#define NOUVEAU_MEM_INTERNAL (NOUVEAU_MEM_INSTANCE | \
+			      NOUVEAU_MEM_NOTIFIER | \
+			      NOUVEAU_MEM_NOVM)
 
 struct drm_nouveau_mem_alloc {
 	int flags;
diff --git a/shared-core/nouveau_drv.h b/shared-core/nouveau_drv.h
index 4184aa5..a51e552 100644
--- a/shared-core/nouveau_drv.h
+++ b/shared-core/nouveau_drv.h
@@ -136,6 +136,7 @@
 	/* NV50 VM */
 	struct nouveau_gpuobj     *vm_pd;
 	struct nouveau_gpuobj_ref *vm_gart_pt;
+	struct nouveau_gpuobj_ref *vm_vram_pt;
 
 	/* Objects */
 	struct nouveau_gpuobj_ref *ramin; /* Private instmem */
@@ -290,6 +291,9 @@
 		unsigned long sg_handle;
 	} gart_info;
 
+	/* G8x global VRAM page table */
+	struct nouveau_gpuobj *vm_vram_pt;
+
 	/* the mtrr covering the FB */
 	int fb_mtrr;
 
diff --git a/shared-core/nouveau_mem.c b/shared-core/nouveau_mem.c
index 4e80ca4..2cf8807 100644
--- a/shared-core/nouveau_mem.c
+++ b/shared-core/nouveau_mem.c
@@ -468,6 +468,11 @@
 	/* Init FB */
 	dev_priv->fb_phys=drm_get_resource_start(dev,1);
 	fb_size = nouveau_mem_fb_amount(dev);
+	/* On G80, limit VRAM to 512MiB temporarily due to limits in how
+	 * we handle VRAM page tables.
+	 */
+	if (dev_priv->card_type >= NV_50 && fb_size > (512 * 1024 * 1024))
+		fb_size = (512 * 1024 * 1024);
 	/* On at least NV40, RAMIN is actually at the end of vram.
 	 * We don't want to allocate this... */
 	if (dev_priv->card_type >= NV_40)
@@ -540,6 +545,21 @@
 		}
 	}
 
+	/* G8x: Allocate shared page table to map real VRAM pages into */
+	if (dev_priv->card_type >= NV_50) {
+		unsigned size = ((512 * 1024 * 1024) / 65536) * 8;
+
+		ret = nouveau_gpuobj_new(dev, NULL, size, 0,
+					 NVOBJ_FLAG_ZERO_ALLOC |
+					 NVOBJ_FLAG_ALLOW_NO_REFS,
+					 &dev_priv->vm_vram_pt);
+		if (ret) {
+			DRM_ERROR("Error creating VRAM page table: %d\n", ret);
+			return ret;
+		}
+	}
+
+
 	return 0;
 }
 
@@ -558,6 +578,12 @@
 	if (alignment < PAGE_SHIFT)
 		alignment = PAGE_SHIFT;
 
+	/* Align allocation sizes to 64KiB blocks on G8x.  We use a 64KiB
+	 * page size in the GPU VM.
+	 */
+	if (flags & NOUVEAU_MEM_FB && dev_priv->card_type >= NV_50)
+		size = (size + (64 * 1024)) & ~((64 * 1024) - 1);
+
 	/*
 	 * Warn about 0 sized allocations, but let it go through. It'll return 1 page
 	 */
@@ -612,6 +638,30 @@
 alloc_ok:
 	block->flags=type;
 
+	/* On G8x, map memory into VM */
+	if (block->flags & NOUVEAU_MEM_FB && dev_priv->card_type >= NV_50 &&
+	    !(flags & NOUVEAU_MEM_NOVM)) {
+		struct nouveau_gpuobj *pt = dev_priv->vm_vram_pt;
+		unsigned offset = block->start;
+		unsigned count = block->size / 65536;
+
+		if (!pt) {
+			DRM_ERROR("vm alloc without vm pt\n");
+			nouveau_mem_free_block(block);
+			return NULL;
+		}
+
+		while (count--) {
+			unsigned pte = offset / 65536;
+
+			INSTANCE_WR(pt, (pte * 2) + 0, offset | 1);
+			INSTANCE_WR(pt, (pte * 2) + 1, 0x00000000);
+			offset += 65536;
+		}
+	} else {
+		block->flags |= NOUVEAU_MEM_NOVM;
+	}	
+
 	if (flags&NOUVEAU_MEM_MAPPED)
 	{
 		struct drm_map_list *entry;
@@ -653,9 +703,34 @@
 
 void nouveau_mem_free(struct drm_device* dev, struct mem_block* block)
 {
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
 	DRM_DEBUG("freeing 0x%llx type=0x%08x\n", block->start, block->flags);
+
 	if (block->flags&NOUVEAU_MEM_MAPPED)
 		drm_rmmap(dev, block->map);
+
+	/* G8x: zero this block's 64KiB PTEs in the global VRAM page table */
+	if (block->flags & NOUVEAU_MEM_FB && dev_priv->card_type >= NV_50 &&
+	    !(block->flags & NOUVEAU_MEM_NOVM)) {
+		struct nouveau_gpuobj *pt = dev_priv->vm_vram_pt;
+		unsigned offset = block->start;
+		unsigned count = block->size / 65536;
+
+		if (!pt) {
+			DRM_ERROR("vm free without vm pt\n");
+			goto out_free;
+		}
+
+		while (count--) {
+			unsigned pte = offset / 65536;
+			INSTANCE_WR(pt, (pte * 2) + 0, 0);
+			INSTANCE_WR(pt, (pte * 2) + 1, 0);
+			offset += 65536;
+		}
+	}
+
+out_free:
 	nouveau_mem_free_block(block);
 }
 
@@ -670,6 +745,9 @@
 
 	NOUVEAU_CHECK_INITIALISED_WITH_RETURN;
 
+	if (alloc->flags & NOUVEAU_MEM_INTERNAL)
+		return -EINVAL;
+
 	block=nouveau_mem_alloc(dev, alloc->alignment, alloc->size,
 				alloc->flags, file_priv);
 	if (!block)
diff --git a/shared-core/nouveau_object.c b/shared-core/nouveau_object.c
index b6bf759..09f9027 100644
--- a/shared-core/nouveau_object.c
+++ b/shared-core/nouveau_object.c
@@ -983,7 +983,11 @@
 			return ret;
 	}
 
-	/* NV50 VM, point offset 0-512MiB at shared PCIEGART table  */
+	/* NV50 VM
+	 *  - Allocate per-channel page-directory
+	 *  - Point offset 0-512MiB at shared PCIEGART table
+	 *  - Point offset 512-1024MiB at shared VRAM table
+	 */
 	if (dev_priv->card_type >= NV_50) {
 		uint32_t vm_offset;
 
@@ -1004,6 +1008,14 @@
 		INSTANCE_WR(chan->vm_pd, (0+0)/4,
 			    chan->vm_gart_pt->instance | 0x03);
 		INSTANCE_WR(chan->vm_pd, (0+4)/4, 0x00000000);
+
+		if ((ret = nouveau_gpuobj_ref_add(dev, NULL, 0,
+						  dev_priv->vm_vram_pt,
+						  &chan->vm_vram_pt)))
+			return ret;
+		INSTANCE_WR(chan->vm_pd, (8+0)/4,
+			    chan->vm_vram_pt->instance | 0x61);
+		INSTANCE_WR(chan->vm_pd, (8+4)/4, 0x00000000);
 	}
 
 	/* RAMHT */
@@ -1022,6 +1034,17 @@
 	}
 
 	/* VRAM ctxdma */
+	if (dev_priv->card_type >= NV_50) {
+		ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY,
+					     512*1024*1024,
+					     dev_priv->fb_available_size,
+					     NV_DMA_ACCESS_RW,
+					     NV_DMA_TARGET_AGP, &vram);
+		if (ret) {
+			DRM_ERROR("Error creating VRAM ctxdma: %d\n", ret);
+			return ret;
+		}
+	} else
 	if ((ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY,
 					  0, dev_priv->fb_available_size,
 					  NV_DMA_ACCESS_RW,
@@ -1084,6 +1107,7 @@
 
 	nouveau_gpuobj_del(dev, &chan->vm_pd);
 	nouveau_gpuobj_ref_del(dev, &chan->vm_gart_pt);
+	nouveau_gpuobj_ref_del(dev, &chan->vm_vram_pt);
 
 	if (chan->ramin_heap)
 		nouveau_mem_takedown(&chan->ramin_heap);
diff --git a/shared-core/nouveau_state.c b/shared-core/nouveau_state.c
index 1216216..5ed16d7 100644
--- a/shared-core/nouveau_state.c
+++ b/shared-core/nouveau_state.c
@@ -384,6 +384,7 @@
 		nouveau_sgdma_takedown(dev);
 
 		nouveau_gpuobj_takedown(dev);
+		nouveau_gpuobj_del(dev, &dev_priv->vm_vram_pt);
 
 		nouveau_mem_close(dev);
 		engine->instmem.takedown(dev);
diff --git a/shared-core/nv50_instmem.c b/shared-core/nv50_instmem.c
index 9687ecb..b7a51f0 100644
--- a/shared-core/nv50_instmem.c
+++ b/shared-core/nv50_instmem.c
@@ -243,7 +243,8 @@
 		return -EINVAL;
 
 	gpuobj->im_backing = nouveau_mem_alloc(dev, NV50_INSTMEM_PAGE_SIZE,
-					       *sz, NOUVEAU_MEM_FB,
+					       *sz, NOUVEAU_MEM_FB |
+					       NOUVEAU_MEM_NOVM,
 					       (struct drm_file *)-2);
 	if (!gpuobj->im_backing) {
 		DRM_ERROR("Couldn't allocate vram to back PRAMIN pages\n");