drm/vc4: Allow using more than 256MB of CMA memory.

Until now, we've had to limit Raspberry Pi to 256MB of CMA memory to
keep from triggering the hardware addressing bug between the tile
binner and the tile alloc memory (where the top 4 bits come from the
tile state data array's address).

To work around that and allow more memory to be reserved for graphics,
allocate a single BO to store tile state data arrays and tile
alloc/overflow memory while the GPU is active, and make sure that that
one BO doesn't happen to cross a 256MB boundary.  With that in place,
we can allocate textures and shaders anywhere in system memory (still
contiguous, of course).

Signed-off-by: Eric Anholt <eric@anholt.net>
Link: http://patchwork.freedesktop.org/patch/msgid/20170327231025.19391-1-eric@anholt.net
Reviewed-by: Boris Brezillon <boris.brezillon@free-electrons.com>
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 81d2bc0..b0967e2 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -99,12 +99,23 @@
 	 */
 	struct list_head seqno_cb_list;
 
-	/* The binner overflow memory that's currently set up in
-	 * BPOA/BPOS registers.  When overflow occurs and a new one is
-	 * allocated, the previous one will be moved to
-	 * vc4->current_exec's free list.
+	/* The memory used for storing binner tile alloc, tile state,
+	 * and overflow memory allocations.  This is freed when V3D
+	 * powers down.
 	 */
-	struct vc4_bo *overflow_mem;
+	struct vc4_bo *bin_bo;
+
+	/* Size of blocks allocated within bin_bo. */
+	uint32_t bin_alloc_size;
+
+	/* Bitmask of the bin_alloc_size chunks in bin_bo that are
+	 * used.
+	 */
+	uint32_t bin_alloc_used;
+
+	/* Bitmask of the current bin_alloc used for overflow memory. */
+	uint32_t bin_alloc_overflow;
+
 	struct work_struct overflow_mem_work;
 
 	int power_refcount;
@@ -316,8 +327,12 @@
 	bool found_increment_semaphore_packet;
 	bool found_flush;
 	uint8_t bin_tiles_x, bin_tiles_y;
-	struct drm_gem_cma_object *tile_bo;
+	/* Physical address of the start of the tile alloc array
+	 * (where each tile's binned CL will start)
+	 */
 	uint32_t tile_alloc_offset;
+	/* Bitmask of which binner slots are freed when this job completes. */
+	uint32_t bin_slots;
 
 	/**
 	 * Computed addresses pointing into exec_bo where we start the
@@ -552,6 +567,7 @@
 extern struct platform_driver vc4_v3d_driver;
 int vc4_v3d_debugfs_ident(struct seq_file *m, void *unused);
 int vc4_v3d_debugfs_regs(struct seq_file *m, void *unused);
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4);
 
 /* vc4_validate.c */
 int
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 12fb70e..735412e 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -820,6 +820,7 @@
 vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	unsigned long irqflags;
 	unsigned i;
 
 	/* If we got force-completed because of GPU reset rather than
@@ -841,6 +842,11 @@
 		drm_gem_object_unreference_unlocked(&bo->base.base);
 	}
 
+	/* Free up the allocation of any bin slots we used. */
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	vc4->bin_alloc_used &= ~exec->bin_slots;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
 	mutex_lock(&vc4->power_lock);
 	if (--vc4->power_refcount == 0) {
 		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1101,9 +1107,9 @@
 	/* V3D should already have disabled its interrupt and cleared
 	 * the overflow allocation registers.  Now free the object.
 	 */
-	if (vc4->overflow_mem) {
-		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
-		vc4->overflow_mem = NULL;
+	if (vc4->bin_bo) {
+		drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+		vc4->bin_bo = NULL;
 	}
 
 	if (vc4->hang_state)
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index 1384af9..7d7af3a 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -59,50 +59,45 @@
 {
 	struct vc4_dev *vc4 =
 		container_of(work, struct vc4_dev, overflow_mem_work);
-	struct drm_device *dev = vc4->dev;
-	struct vc4_bo *bo;
+	struct vc4_bo *bo = vc4->bin_bo;
+	int bin_bo_slot;
+	struct vc4_exec_info *exec;
+	unsigned long irqflags;
 
-	bo = vc4_bo_create(dev, 256 * 1024, true);
-	if (IS_ERR(bo)) {
+	bin_bo_slot = vc4_v3d_get_bin_slot(vc4);
+	if (bin_bo_slot < 0) {
 		DRM_ERROR("Couldn't allocate binner overflow mem\n");
 		return;
 	}
 
-	/* If there's a job executing currently, then our previous
-	 * overflow allocation is getting used in that job and we need
-	 * to queue it to be released when the job is done.  But if no
-	 * job is executing at all, then we can free the old overflow
-	 * object direcctly.
-	 *
-	 * No lock necessary for this pointer since we're the only
-	 * ones that update the pointer, and our workqueue won't
-	 * reenter.
-	 */
-	if (vc4->overflow_mem) {
-		struct vc4_exec_info *current_exec;
-		unsigned long irqflags;
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
 
-		spin_lock_irqsave(&vc4->job_lock, irqflags);
-		current_exec = vc4_first_bin_job(vc4);
-		if (!current_exec)
-			current_exec = vc4_last_render_job(vc4);
-		if (current_exec) {
-			vc4->overflow_mem->seqno = current_exec->seqno;
-			list_add_tail(&vc4->overflow_mem->unref_head,
-				      &current_exec->unref_list);
-			vc4->overflow_mem = NULL;
+	if (vc4->bin_alloc_overflow) {
+		/* If we had overflow memory allocated previously,
+		 * then that chunk will free when the current bin job
+		 * is done.  If we don't have a bin job running, then
+		 * the chunk will be done whenever the list of render
+		 * jobs has drained.
+		 */
+		exec = vc4_first_bin_job(vc4);
+		if (!exec)
+			exec = vc4_last_render_job(vc4);
+		if (exec) {
+			exec->bin_slots |= vc4->bin_alloc_overflow;
+		} else {
+			/* There's nothing queued in the hardware, so
+			 * the old slot is free immediately.
+			 */
+			vc4->bin_alloc_used &= ~vc4->bin_alloc_overflow;
 		}
-		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 	}
+	vc4->bin_alloc_overflow = BIT(bin_bo_slot);
 
-	if (vc4->overflow_mem)
-		drm_gem_object_unreference_unlocked(&vc4->overflow_mem->base.base);
-	vc4->overflow_mem = bo;
-
-	V3D_WRITE(V3D_BPOA, bo->base.paddr);
+	V3D_WRITE(V3D_BPOA, bo->base.paddr + bin_bo_slot * vc4->bin_alloc_size);
 	V3D_WRITE(V3D_BPOS, bo->base.base.size);
 	V3D_WRITE(V3D_INTCTL, V3D_INT_OUTOMEM);
 	V3D_WRITE(V3D_INTENA, V3D_INT_OUTOMEM);
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 }
 
 static void
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
index 4339471..5dc1942 100644
--- a/drivers/gpu/drm/vc4/vc4_render_cl.c
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -182,8 +182,7 @@
 
 	if (has_bin) {
 		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
-		rcl_u32(setup, (exec->tile_bo->paddr +
-				exec->tile_alloc_offset +
+		rcl_u32(setup, (exec->tile_alloc_offset +
 				(y * exec->bin_tiles_x + x) * 32));
 	}
 
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 7cc346a..a88078d 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -156,6 +156,144 @@
 	V3D_WRITE(V3D_VPMBASE, 0);
 }
 
+int vc4_v3d_get_bin_slot(struct vc4_dev *vc4)
+{
+	struct drm_device *dev = vc4->dev;
+	unsigned long irqflags;
+	int slot;
+	uint64_t seqno = 0;
+	struct vc4_exec_info *exec;
+
+try_again:
+	spin_lock_irqsave(&vc4->job_lock, irqflags);
+	slot = ffs(~vc4->bin_alloc_used);
+	if (slot != 0) {
+		/* Switch from ffs() bit index to a 0-based index. */
+		slot--;
+		vc4->bin_alloc_used |= BIT(slot);
+		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+		return slot;
+	}
+
+	/* Couldn't find an open slot.  Wait for render to complete
+	 * and try again.
+	 */
+	exec = vc4_last_render_job(vc4);
+	if (exec)
+		seqno = exec->seqno;
+	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
+
+	if (seqno) {
+		int ret = vc4_wait_for_seqno(dev, seqno, ~0ull, true);
+
+		if (ret == 0)
+			goto try_again;
+
+		return ret;
+	}
+
+	return -ENOMEM;
+}
+
+/**
+ * vc4_allocate_bin_bo() - allocates the memory that will be used for
+ * tile binning.
+ *
+ * The binner has a limitation that the addresses in the tile state
+ * buffer that point into the tile alloc buffer or binner overflow
+ * memory only have 28 bits (256MB), and the top 4 on the bus for
+ * tile alloc references end up coming from the tile state buffer's
+ * address.
+ *
+ * To work around this, we allocate a single large buffer while V3D is
+ * in use, make sure that it has the top 4 bits constant across its
+ * entire extent, and then put the tile state, tile alloc, and binner
+ * overflow memory inside that buffer.
+ *
+ * This creates a limitation where we may not be able to execute a job
+ * if it doesn't fit within the buffer that we allocated up front.
+ * However, it turns out that 16MB is "enough for anybody", and
+ * real-world applications run into allocation failures from the
+ * overall CMA pool before they make scenes complicated enough to run
+ * out of bin space.
+ */
+int
+vc4_allocate_bin_bo(struct drm_device *drm)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(drm);
+	struct vc4_v3d *v3d = vc4->v3d;
+	uint32_t size = 16 * 1024 * 1024;
+	int ret = 0;
+	struct list_head list;
+
+	/* We may need to try allocating more than once to get a BO
+	 * that doesn't cross 256MB.  Track the ones we've allocated
+	 * that failed so far, so that we can free them when we've got
+	 * one that succeeded (if we freed them right away, our next
+	 * allocation would probably be the same chunk of memory).
+	 */
+	INIT_LIST_HEAD(&list);
+
+	while (true) {
+		struct vc4_bo *bo = vc4_bo_create(drm, size, true);
+
+		if (IS_ERR(bo)) {
+			ret = PTR_ERR(bo);
+
+			dev_err(&v3d->pdev->dev,
+				"Failed to allocate memory for tile binning: "
+				"%d. You may need to enable CMA or give it "
+				"more memory.",
+				ret);
+			break;
+		}
+
+		/* Check if this BO won't trigger the addressing bug. */
+		if ((bo->base.paddr & 0xf0000000) ==
+		    ((bo->base.paddr + bo->base.base.size - 1) & 0xf0000000)) {
+			vc4->bin_bo = bo;
+
+			/* Set up for allocating 512KB chunks of
+			 * binner memory.  The biggest allocation we
+			 * need to do is for the initial tile alloc +
+			 * tile state buffer.  We can render to a
+			 * maximum of ((2048*2048) / (32*32) = 4096
+			 * tiles in a frame (until we do floating
+			 * point rendering, at which point it would be
+			 * 8192).  Tile state is 48b/tile (rounded to
+			 * a page), and tile alloc is 32b/tile
+			 * (rounded to a page), plus a page of extra,
+			 * for a total of 320kb for our worst-case.
+			 * We choose 512kb so that it divides evenly
+			 * into our 16MB, and the rest of the 512kb
+			 * will be used as storage for the overflow
+			 * from the initial 32b CL per bin.
+			 */
+			vc4->bin_alloc_size = 512 * 1024;
+			vc4->bin_alloc_used = 0;
+			vc4->bin_alloc_overflow = 0;
+			WARN_ON_ONCE(sizeof(vc4->bin_alloc_used) * 8 !=
+				     bo->base.base.size / vc4->bin_alloc_size);
+
+			break;
+		}
+
+		/* Put it on the list to free later, and try again. */
+		list_add(&bo->unref_head, &list);
+	}
+
+	/* Free all the BOs we allocated but didn't choose. */
+	while (!list_empty(&list)) {
+		struct vc4_bo *bo = list_last_entry(&list,
+						    struct vc4_bo, unref_head);
+
+		list_del(&bo->unref_head);
+		drm_gem_object_put_unlocked(&bo->base.base);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_PM
 static int vc4_v3d_runtime_suspend(struct device *dev)
 {
@@ -164,6 +302,9 @@
 
 	vc4_irq_uninstall(vc4->dev);
 
+	drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
+	vc4->bin_bo = NULL;
+
 	return 0;
 }
 
@@ -171,6 +312,11 @@
 {
 	struct vc4_v3d *v3d = dev_get_drvdata(dev);
 	struct vc4_dev *vc4 = v3d->vc4;
+	int ret;
+
+	ret = vc4_allocate_bin_bo(vc4->dev);
+	if (ret)
+		return ret;
 
 	vc4_v3d_init_hw(vc4->dev);
 	vc4_irq_postinstall(vc4->dev);
@@ -208,6 +354,10 @@
 		return -EINVAL;
 	}
 
+	ret = vc4_allocate_bin_bo(drm);
+	if (ret)
+		return ret;
+
 	/* Reset the binner overflow address/size at setup, to be sure
 	 * we don't reuse an old one.
 	 */
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c
index da6f1e1..3de8f11 100644
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -348,10 +348,11 @@
 validate_tile_binning_config(VALIDATE_ARGS)
 {
 	struct drm_device *dev = exec->exec_bo->base.dev;
-	struct vc4_bo *tile_bo;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
 	uint8_t flags;
-	uint32_t tile_state_size, tile_alloc_size;
-	uint32_t tile_count;
+	uint32_t tile_state_size;
+	uint32_t tile_count, bin_addr;
+	int bin_slot;
 
 	if (exec->found_tile_binning_mode_config_packet) {
 		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
@@ -377,13 +378,28 @@
 		return -EINVAL;
 	}
 
+	bin_slot = vc4_v3d_get_bin_slot(vc4);
+	if (bin_slot < 0) {
+		if (bin_slot != -EINTR && bin_slot != -ERESTARTSYS) {
+			DRM_ERROR("Failed to allocate binner memory: %d\n",
+				  bin_slot);
+		}
+		return bin_slot;
+	}
+
+	/* The slot we allocated will only be used by this job, and is
+	 * free when the job completes rendering.
+	 */
+	exec->bin_slots |= BIT(bin_slot);
+	bin_addr = vc4->bin_bo->base.paddr + bin_slot * vc4->bin_alloc_size;
+
 	/* The tile state data array is 48 bytes per tile, and we put it at
 	 * the start of a BO containing both it and the tile alloc.
 	 */
 	tile_state_size = 48 * tile_count;
 
 	/* Since the tile alloc array will follow us, align. */
-	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
+	exec->tile_alloc_offset = bin_addr + roundup(tile_state_size, 4096);
 
 	*(uint8_t *)(validated + 14) =
 		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
@@ -394,35 +410,13 @@
 		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
 			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
 
-	/* Initial block size. */
-	tile_alloc_size = 32 * tile_count;
-
-	/*
-	 * The initial allocation gets rounded to the next 256 bytes before
-	 * the hardware starts fulfilling further allocations.
-	 */
-	tile_alloc_size = roundup(tile_alloc_size, 256);
-
-	/* Add space for the extra allocations.  This is what gets used first,
-	 * before overflow memory.  It must have at least 4096 bytes, but we
-	 * want to avoid overflow memory usage if possible.
-	 */
-	tile_alloc_size += 1024 * 1024;
-
-	tile_bo = vc4_bo_create(dev, exec->tile_alloc_offset + tile_alloc_size,
-				true);
-	exec->tile_bo = &tile_bo->base;
-	if (IS_ERR(exec->tile_bo))
-		return PTR_ERR(exec->tile_bo);
-	list_add_tail(&tile_bo->unref_head, &exec->unref_list);
-
 	/* tile alloc address. */
-	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
-					exec->tile_alloc_offset);
+	*(uint32_t *)(validated + 0) = exec->tile_alloc_offset;
 	/* tile alloc size. */
-	*(uint32_t *)(validated + 4) = tile_alloc_size;
+	*(uint32_t *)(validated + 4) = (bin_addr + vc4->bin_alloc_size -
+					exec->tile_alloc_offset);
 	/* tile state address. */
-	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
+	*(uint32_t *)(validated + 8) = bin_addr;
 
 	return 0;
 }