drm/vc4: Add a BO cache.

We need to allocate new BOs in the kernel as part of each frame, but
the CMA allocator is way too slow for that.  As an optimization, keep
track of recently-freed BOs and reuse them, with a 1 second timeout to
fully free them back to the system.

This improves 3D performance by about 15%.

Signed-off-by: Eric Anholt <eric@anholt.net>
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index fd8319f..39a1ff5 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -17,6 +17,37 @@
 	struct vc4_crtc *crtc[3];
 
 	struct drm_fbdev_cma *fbdev;
+
+	/* The kernel-space BO cache.  Tracks buffers that have been
+	 * unreferenced by all other users (refcounts of 0!) but not
+	 * yet freed, so we can do cheap allocations.
+	 */
+	struct vc4_bo_cache {
+		/* Array of list heads for entries in the BO cache,
+		 * based on number of pages, so we can do O(1) lookups
+		 * in the cache when allocating.
+		 */
+		struct list_head *size_list;
+		uint32_t size_list_size;
+
+		/* List of all BOs in the cache, ordered by age, so we
+		 * can do O(1) lookups when trying to free old
+		 * buffers.
+		 */
+		struct list_head time_list;
+		struct work_struct time_work;
+		struct timer_list time_timer;
+	} bo_cache;
+
+	struct vc4_bo_stats {
+		u32 num_allocated;
+		u32 size_allocated;
+		u32 num_cached;
+		u32 size_cached;
+	} bo_stats;
+
+	/* Protects bo_cache and the BO stats. */
+	struct mutex bo_lock;
 };
 
 static inline struct vc4_dev *
@@ -27,6 +58,17 @@
 
 struct vc4_bo {
 	struct drm_gem_cma_object base;
+
+	/* List entry for the BO's position in either
+	 * vc4_exec_info->unref_list or vc4_dev->bo_cache.time_list
+	 */
+	struct list_head unref_head;
+
+	/* Time in jiffies when the BO was put in vc4->bo_cache. */
+	unsigned long free_time;
+
+	/* List entry for the BO's position in vc4_dev->bo_cache.size_list */
+	struct list_head size_head;
 };
 
 static inline struct vc4_bo *
@@ -104,13 +146,18 @@
 #define wait_for(COND, MS) _wait_for(COND, MS, 1)
 
 /* vc4_bo.c */
+struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size);
 void vc4_free_object(struct drm_gem_object *gem_obj);
-struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size);
+struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t size,
+			     bool from_cache);
 int vc4_dumb_create(struct drm_file *file_priv,
 		    struct drm_device *dev,
 		    struct drm_mode_create_dumb *args);
 struct dma_buf *vc4_prime_export(struct drm_device *dev,
 				 struct drm_gem_object *obj, int flags);
+void vc4_bo_cache_init(struct drm_device *dev);
+void vc4_bo_cache_destroy(struct drm_device *dev);
+int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
 
 /* vc4_crtc.c */
 extern struct platform_driver vc4_crtc_driver;