drm/ttm/radeon/nouveau: Kill the bo lock in favour of a bo device fence_lock

The bo lock was used only to protect the bo sync object members, and since
it is a per-bo lock, fencing a buffer list means a lot of locks and unlocks.
Replace it with a per-device lock that protects the sync object members on
*all* bos. Reading and setting these members will always be very quick, so
the risk of heavy lock contention is microscopic. Note that waiting for
sync objects will always take place outside of this lock.

The bo device fence lock will eventually be replaced with a seqlock /
RCU mechanism so we can determine that a bo is idle under an
RCU / read seqlock.
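
For illustration only, that eventual lockless idle check could look roughly
like the sketch below (the fence_seqlock member is hypothetical and not part
of this patch):

	/* Hypothetical lockless idle check; bdev->fence_seqlock does not
	 * exist yet.
	 */
	unsigned seq;
	bool idle;

	do {
		seq = read_seqbegin(&bdev->fence_seqlock);
		idle = (bo->sync_obj == NULL);
	} while (read_seqretry(&bdev->fence_seqlock, seq));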

However, this change allows us to batch fencing and unreserving of
buffers with a minimal amount of locking, as sketched below.
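
As a rough sketch (not part of this patch), a caller holding reservations
on all bos in a list could then fence them with a single lock round-trip,
along the lines of ttm_eu_fence_buffer_objects() below; the old_sync_obj
field on the list entry is hypothetical:

	spin_lock(&bdev->fence_lock);
	list_for_each_entry(entry, list, head) {
		struct ttm_buffer_object *bo = entry->bo;

		/* Stash the replaced sync object; unref it after the lock
		 * is dropped, together with unreserving the bo.
		 */
		entry->old_sync_obj = bo->sync_obj;
		bo->sync_obj = driver->sync_obj_ref(sync_obj);
		bo->sync_obj_arg = entry->new_sync_obj_arg;
	}
	spin_unlock(&bdev->fence_lock);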

Signed-off-by: Thomas Hellstrom <thellstrom@vmware.com>
Reviewed-by: Jerome Glisse <j.glisse@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 9a1fdcf..1f2301d 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -234,10 +234,10 @@
 		if (likely(fence)) {
 			struct nouveau_fence *prev_fence;
 
-			spin_lock(&nvbo->bo.lock);
+			spin_lock(&nvbo->bo.bdev->fence_lock);
 			prev_fence = nvbo->bo.sync_obj;
 			nvbo->bo.sync_obj = nouveau_fence_ref(fence);
-			spin_unlock(&nvbo->bo.lock);
+			spin_unlock(&nvbo->bo.bdev->fence_lock);
 			nouveau_fence_unref((void *)&prev_fence);
 		}
 
@@ -557,9 +557,9 @@
 				data |= r->vor;
 		}
 
-		spin_lock(&nvbo->bo.lock);
+		spin_lock(&nvbo->bo.bdev->fence_lock);
 		ret = ttm_bo_wait(&nvbo->bo, false, false, false);
-		spin_unlock(&nvbo->bo.lock);
+		spin_unlock(&nvbo->bo.bdev->fence_lock);
 		if (ret) {
 			NV_ERROR(dev, "reloc wait_idle failed: %d\n", ret);
 			break;
@@ -791,9 +791,9 @@
 	}
 
 	if (req->flags & NOUVEAU_GEM_CPU_PREP_NOBLOCK) {
-		spin_lock(&nvbo->bo.lock);
+		spin_lock(&nvbo->bo.bdev->fence_lock);
 		ret = ttm_bo_wait(&nvbo->bo, false, false, no_wait);
-		spin_unlock(&nvbo->bo.lock);
+		spin_unlock(&nvbo->bo.bdev->fence_lock);
 	} else {
 		ret = ttm_bo_synccpu_write_grab(&nvbo->bo, no_wait);
 		if (ret == 0)
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index 1d06774..e939cb6 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -369,11 +369,11 @@
 
 	list_for_each_entry(lobj, head, list) {
 		bo = lobj->bo;
-		spin_lock(&bo->tbo.lock);
+		spin_lock(&bo->tbo.bdev->fence_lock);
 		old_fence = (struct radeon_fence *)bo->tbo.sync_obj;
 		bo->tbo.sync_obj = radeon_fence_ref(fence);
 		bo->tbo.sync_obj_arg = NULL;
-		spin_unlock(&bo->tbo.lock);
+		spin_unlock(&bo->tbo.bdev->fence_lock);
 		if (old_fence) {
 			radeon_fence_unref(&old_fence);
 		}
diff --git a/drivers/gpu/drm/radeon/radeon_object.h b/drivers/gpu/drm/radeon/radeon_object.h
index d143702..fd53675 100644
--- a/drivers/gpu/drm/radeon/radeon_object.h
+++ b/drivers/gpu/drm/radeon/radeon_object.h
@@ -126,12 +126,12 @@
 	r = ttm_bo_reserve(&bo->tbo, true, no_wait, false, 0);
 	if (unlikely(r != 0))
 		return r;
-	spin_lock(&bo->tbo.lock);
+	spin_lock(&bo->tbo.bdev->fence_lock);
 	if (mem_type)
 		*mem_type = bo->tbo.mem.mem_type;
 	if (bo->tbo.sync_obj)
 		r = ttm_bo_wait(&bo->tbo, true, true, no_wait);
-	spin_unlock(&bo->tbo.lock);
+	spin_unlock(&bo->tbo.bdev->fence_lock);
 	ttm_bo_unreserve(&bo->tbo);
 	return r;
 }
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 5d87508..d93c73b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -427,11 +427,9 @@
 	}
 
 	if (bo->mem.mm_node) {
-		spin_lock(&bo->lock);
 		bo->offset = (bo->mem.start << PAGE_SHIFT) +
 		    bdev->man[bo->mem.mem_type].gpu_offset;
 		bo->cur_placement = bo->mem.placement;
-		spin_unlock(&bo->lock);
 	} else
 		bo->offset = 0;
 
@@ -485,14 +483,14 @@
 	int put_count;
 	int ret;
 
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	(void) ttm_bo_wait(bo, false, false, true);
 	if (!bo->sync_obj) {
 
 		spin_lock(&glob->lru_lock);
 
 		/**
-		 * Lock inversion between bo::reserve and bo::lock here,
+		 * Lock inversion between bo::reserve and bdev::fence_lock here,
 		 * but that's OK, since we're only trylocking.
 		 */
 
@@ -501,7 +499,7 @@
 		if (unlikely(ret == -EBUSY))
 			goto queue;
 
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 		put_count = ttm_bo_del_from_lru(bo);
 
 		spin_unlock(&glob->lru_lock);
@@ -522,7 +520,7 @@
 	kref_get(&bo->list_kref);
 	list_add_tail(&bo->ddestroy, &bdev->ddestroy);
 	spin_unlock(&glob->lru_lock);
-	spin_unlock(&bo->lock);
+	spin_unlock(&bdev->fence_lock);
 
 	if (sync_obj) {
 		driver->sync_obj_flush(sync_obj, sync_obj_arg);
@@ -547,14 +545,15 @@
 			       bool no_wait_reserve,
 			       bool no_wait_gpu)
 {
+	struct ttm_bo_device *bdev = bo->bdev;
 	struct ttm_bo_global *glob = bo->glob;
 	int put_count;
 	int ret = 0;
 
 retry:
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bo->lock);
+	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0))
 		return ret;
@@ -707,9 +706,9 @@
 	struct ttm_placement placement;
 	int ret = 0;
 
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bo->lock);
+	spin_unlock(&bdev->fence_lock);
 
 	if (unlikely(ret != 0)) {
 		if (ret != -ERESTARTSYS) {
@@ -1044,6 +1043,7 @@
 {
 	int ret = 0;
 	struct ttm_mem_reg mem;
+	struct ttm_bo_device *bdev = bo->bdev;
 
 	BUG_ON(!atomic_read(&bo->reserved));
 
@@ -1052,9 +1052,9 @@
 	 * Have the driver move function wait for idle when necessary,
 	 * instead of doing it here.
 	 */
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, interruptible, no_wait_gpu);
-	spin_unlock(&bo->lock);
+	spin_unlock(&bdev->fence_lock);
 	if (ret)
 		return ret;
 	mem.num_pages = bo->num_pages;
@@ -1171,7 +1171,6 @@
 	}
 	bo->destroy = destroy;
 
-	spin_lock_init(&bo->lock);
 	kref_init(&bo->kref);
 	kref_init(&bo->list_kref);
 	atomic_set(&bo->cpu_writers, 0);
@@ -1535,7 +1534,7 @@
 	bdev->dev_mapping = NULL;
 	bdev->glob = glob;
 	bdev->need_dma32 = need_dma32;
-
+	spin_lock_init(&bdev->fence_lock);
 	mutex_lock(&glob->device_list_mutex);
 	list_add_tail(&bdev->device_list, &glob->device_list);
 	mutex_unlock(&glob->device_list_mutex);
@@ -1659,6 +1658,7 @@
 		bool lazy, bool interruptible, bool no_wait)
 {
 	struct ttm_bo_driver *driver = bo->bdev->driver;
+	struct ttm_bo_device *bdev = bo->bdev;
 	void *sync_obj;
 	void *sync_obj_arg;
 	int ret = 0;
@@ -1672,9 +1672,9 @@
 			void *tmp_obj = bo->sync_obj;
 			bo->sync_obj = NULL;
 			clear_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
-			spin_unlock(&bo->lock);
+			spin_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bo->lock);
+			spin_lock(&bdev->fence_lock);
 			continue;
 		}
 
@@ -1683,29 +1683,29 @@
 
 		sync_obj = driver->sync_obj_ref(bo->sync_obj);
 		sync_obj_arg = bo->sync_obj_arg;
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 		ret = driver->sync_obj_wait(sync_obj, sync_obj_arg,
 					    lazy, interruptible);
 		if (unlikely(ret != 0)) {
 			driver->sync_obj_unref(&sync_obj);
-			spin_lock(&bo->lock);
+			spin_lock(&bdev->fence_lock);
 			return ret;
 		}
-		spin_lock(&bo->lock);
+		spin_lock(&bdev->fence_lock);
 		if (likely(bo->sync_obj == sync_obj &&
 			   bo->sync_obj_arg == sync_obj_arg)) {
 			void *tmp_obj = bo->sync_obj;
 			bo->sync_obj = NULL;
 			clear_bit(TTM_BO_PRIV_FLAG_MOVING,
 				  &bo->priv_flags);
-			spin_unlock(&bo->lock);
+			spin_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&sync_obj);
 			driver->sync_obj_unref(&tmp_obj);
-			spin_lock(&bo->lock);
+			spin_lock(&bdev->fence_lock);
 		} else {
-			spin_unlock(&bo->lock);
+			spin_unlock(&bdev->fence_lock);
 			driver->sync_obj_unref(&sync_obj);
-			spin_lock(&bo->lock);
+			spin_lock(&bdev->fence_lock);
 		}
 	}
 	return 0;
@@ -1714,6 +1714,7 @@
 
 int ttm_bo_synccpu_write_grab(struct ttm_buffer_object *bo, bool no_wait)
 {
+	struct ttm_bo_device *bdev = bo->bdev;
 	int ret = 0;
 
 	/*
@@ -1723,9 +1724,9 @@
 	ret = ttm_bo_reserve(bo, true, no_wait, false, 0);
 	if (unlikely(ret != 0))
 		return ret;
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, true, no_wait);
-	spin_unlock(&bo->lock);
+	spin_unlock(&bdev->fence_lock);
 	if (likely(ret == 0))
 		atomic_inc(&bo->cpu_writers);
 	ttm_bo_unreserve(bo);
@@ -1797,9 +1798,9 @@
 	 * Wait for GPU, then move to system cached.
 	 */
 
-	spin_lock(&bo->lock);
+	spin_lock(&bo->bdev->fence_lock);
 	ret = ttm_bo_wait(bo, false, false, false);
-	spin_unlock(&bo->lock);
+	spin_unlock(&bo->bdev->fence_lock);
 
 	if (unlikely(ret != 0))
 		goto out;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 3106d5b..4b75133 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -337,7 +337,6 @@
 	 * TODO: Explicit member copy would probably be better here.
 	 */
 
-	spin_lock_init(&fbo->lock);
 	init_waitqueue_head(&fbo->event_queue);
 	INIT_LIST_HEAD(&fbo->ddestroy);
 	INIT_LIST_HEAD(&fbo->lru);
@@ -520,7 +519,7 @@
 	struct ttm_buffer_object *ghost_obj;
 	void *tmp_obj = NULL;
 
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	if (bo->sync_obj) {
 		tmp_obj = bo->sync_obj;
 		bo->sync_obj = NULL;
@@ -529,7 +528,7 @@
 	bo->sync_obj_arg = sync_obj_arg;
 	if (evict) {
 		ret = ttm_bo_wait(bo, false, false, false);
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
 		if (ret)
@@ -552,7 +551,7 @@
 		 */
 
 		set_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags);
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 		if (tmp_obj)
 			driver->sync_obj_unref(&tmp_obj);
 
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index fe6cb77..8dd446c 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -118,17 +118,17 @@
 	 * move.
 	 */
 
-	spin_lock(&bo->lock);
+	spin_lock(&bdev->fence_lock);
 	if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
 		ret = ttm_bo_wait(bo, false, true, false);
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 		if (unlikely(ret != 0)) {
 			retval = (ret != -ERESTARTSYS) ?
 			    VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
 			goto out_unlock;
 		}
 	} else
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 
 
 	ret = ttm_mem_io_reserve(bdev, &bo->mem);
diff --git a/drivers/gpu/drm/ttm/ttm_execbuf_util.c b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
index 7dcc647..c3a2100 100644
--- a/drivers/gpu/drm/ttm/ttm_execbuf_util.c
+++ b/drivers/gpu/drm/ttm/ttm_execbuf_util.c
@@ -203,14 +203,15 @@
 
 	list_for_each_entry(entry, list, head) {
 		struct ttm_buffer_object *bo = entry->bo;
-		struct ttm_bo_driver *driver = bo->bdev->driver;
+		struct ttm_bo_device *bdev = bo->bdev;
+		struct ttm_bo_driver *driver = bdev->driver;
 		void *old_sync_obj;
 
-		spin_lock(&bo->lock);
+		spin_lock(&bdev->fence_lock);
 		old_sync_obj = bo->sync_obj;
 		bo->sync_obj = driver->sync_obj_ref(sync_obj);
 		bo->sync_obj_arg = entry->new_sync_obj_arg;
-		spin_unlock(&bo->lock);
+		spin_unlock(&bdev->fence_lock);
 		ttm_bo_unreserve(bo);
 		entry->reserved = false;
 		if (old_sync_obj)