drm/radeon: convert fence to uint64_t v4

This convert fence to use uint64_t sequence number intention is
to use the fact that uin64_t is big enough that we don't need to
care about wrap around.

Tested with and without writeback using 0xFFFFF000 as initial
fence sequence and thus allowing to test the wrap around from
32bits to 64bits.

v2: Add comment about possible race btw CPU & GPU, add comment
    stressing that we need 2 dword aligned for R600_WB_EVENT_OFFSET
    Read fence sequenc in reverse order of GPU write them so we
    mitigate the race btw CPU and GPU.

v3: Drop the need for ring to emit the 64bits fence, and just have
    each ring emit the lower 32bits of the fence sequence. We
    handle the wrap over 32bits in fence_process.

v4: Just a small optimization: Don't reread the last_seq value
    if loop restarts, since we already know its value anyway.
    Also start at zero not one for seq value and use pre instead
    of post increment in emmit, otherwise wait_empty will deadlock.

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Christian König <deathsimple@vodafone.de>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 5bb78bf..feb2bbc 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -66,14 +66,14 @@
 	unsigned long irq_flags;
 
 	write_lock_irqsave(&rdev->fence_lock, irq_flags);
-	if (fence->emitted) {
+	if (fence->seq && fence->seq < RADEON_FENCE_NOTEMITED_SEQ) {
 		write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 		return 0;
 	}
-	fence->seq = atomic_add_return(1, &rdev->fence_drv[fence->ring].seq);
+	/* we are protected by the ring emission mutex */
+	fence->seq = ++rdev->fence_drv[fence->ring].seq;
 	radeon_fence_ring_emit(rdev, fence->ring, fence);
 	trace_radeon_fence_emit(rdev->ddev, fence->seq);
-	fence->emitted = true;
 	/* are we the first fence on a previusly idle ring? */
 	if (list_empty(&rdev->fence_drv[fence->ring].emitted)) {
 		rdev->fence_drv[fence->ring].last_activity = jiffies;
@@ -87,14 +87,60 @@
 {
 	struct radeon_fence *fence;
 	struct list_head *i, *n;
-	uint32_t seq;
+	uint64_t seq, last_seq;
+	unsigned count_loop = 0;
 	bool wake = false;
 
-	seq = radeon_fence_read(rdev, ring);
-	if (seq == rdev->fence_drv[ring].last_seq)
-		return false;
+	/* Note there is a scenario here for an infinite loop but it's
+	 * very unlikely to happen. For it to happen, the current polling
+	 * process need to be interrupted by another process and another
+	 * process needs to update the last_seq btw the atomic read and
+	 * xchg of the current process.
+	 *
+	 * More over for this to go in infinite loop there need to be
+	 * continuously new fence signaled ie radeon_fence_read needs
+	 * to return a different value each time for both the currently
+	 * polling process and the other process that xchg the last_seq
+	 * btw atomic read and xchg of the current process. And the
+	 * value the other process set as last seq must be higher than
+	 * the seq value we just read. Which means that current process
+	 * need to be interrupted after radeon_fence_read and before
+	 * atomic xchg.
+	 *
+	 * To be even more safe we count the number of time we loop and
+	 * we bail after 10 loop just accepting the fact that we might
+	 * have temporarly set the last_seq not to the true real last
+	 * seq but to an older one.
+	 */
+	last_seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
+	do {
+		seq = radeon_fence_read(rdev, ring);
+		seq |= last_seq & 0xffffffff00000000LL;
+		if (seq < last_seq) {
+			seq += 0x100000000LL;
+		}
 
-	rdev->fence_drv[ring].last_seq = seq;
+		if (!wake && seq == last_seq) {
+			return false;
+		}
+		/* If we loop over we don't want to return without
+		 * checking if a fence is signaled as it means that the
+		 * seq we just read is different from the previous on.
+		 */
+		wake = true;
+		if ((count_loop++) > 10) {
+			/* We looped over too many time leave with the
+			 * fact that we might have set an older fence
+			 * seq then the current real last seq as signaled
+			 * by the hw.
+			 */
+			break;
+		}
+		last_seq = seq;
+	} while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
+
+	/* reset wake to false */
+	wake = false;
 	rdev->fence_drv[ring].last_activity = jiffies;
 
 	n = NULL;
@@ -112,7 +158,7 @@
 			n = i->prev;
 			list_move_tail(i, &rdev->fence_drv[ring].signaled);
 			fence = list_entry(i, struct radeon_fence, list);
-			fence->signaled = true;
+			fence->seq = RADEON_FENCE_SIGNALED_SEQ;
 			i = n;
 		} while (i != &rdev->fence_drv[ring].emitted);
 		wake = true;
@@ -128,7 +174,7 @@
 	fence = container_of(kref, struct radeon_fence, kref);
 	write_lock_irqsave(&fence->rdev->fence_lock, irq_flags);
 	list_del(&fence->list);
-	fence->emitted = false;
+	fence->seq = RADEON_FENCE_NOTEMITED_SEQ;
 	write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags);
 	if (fence->semaphore)
 		radeon_semaphore_free(fence->rdev, fence->semaphore);
@@ -145,9 +191,7 @@
 	}
 	kref_init(&((*fence)->kref));
 	(*fence)->rdev = rdev;
-	(*fence)->emitted = false;
-	(*fence)->signaled = false;
-	(*fence)->seq = 0;
+	(*fence)->seq = RADEON_FENCE_NOTEMITED_SEQ;
 	(*fence)->ring = ring;
 	(*fence)->semaphore = NULL;
 	INIT_LIST_HEAD(&(*fence)->list);
@@ -163,18 +207,18 @@
 		return true;
 
 	write_lock_irqsave(&fence->rdev->fence_lock, irq_flags);
-	signaled = fence->signaled;
+	signaled = (fence->seq == RADEON_FENCE_SIGNALED_SEQ);
 	/* if we are shuting down report all fence as signaled */
 	if (fence->rdev->shutdown) {
 		signaled = true;
 	}
-	if (!fence->emitted) {
+	if (fence->seq == RADEON_FENCE_NOTEMITED_SEQ) {
 		WARN(1, "Querying an unemitted fence : %p !\n", fence);
 		signaled = true;
 	}
 	if (!signaled) {
 		radeon_fence_poll_locked(fence->rdev, fence->ring);
-		signaled = fence->signaled;
+		signaled = (fence->seq == RADEON_FENCE_SIGNALED_SEQ);
 	}
 	write_unlock_irqrestore(&fence->rdev->fence_lock, irq_flags);
 	return signaled;
@@ -183,8 +227,8 @@
 int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 {
 	struct radeon_device *rdev;
-	unsigned long irq_flags, timeout;
-	u32 seq;
+	unsigned long irq_flags, timeout, last_activity;
+	uint64_t seq;
 	int i, r;
 	bool signaled;
 
@@ -207,7 +251,9 @@
 			timeout = 1;
 		}
 		/* save current sequence value used to check for GPU lockups */
-		seq = rdev->fence_drv[fence->ring].last_seq;
+		seq = atomic64_read(&rdev->fence_drv[fence->ring].last_seq);
+		/* Save current last activity valuee, used to check for GPU lockups */
+		last_activity = rdev->fence_drv[fence->ring].last_activity;
 		read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 
 		trace_radeon_fence_wait_begin(rdev->ddev, seq);
@@ -235,24 +281,23 @@
 			}
 
 			write_lock_irqsave(&rdev->fence_lock, irq_flags);
-			/* check if sequence value has changed since last_activity */
-			if (seq != rdev->fence_drv[fence->ring].last_seq) {
+			/* test if somebody else has already decided that this is a lockup */
+			if (last_activity != rdev->fence_drv[fence->ring].last_activity) {
 				write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 				continue;
 			}
 
-			/* change sequence value on all rings, so nobody else things there is a lockup */
-			for (i = 0; i < RADEON_NUM_RINGS; ++i)
-				rdev->fence_drv[i].last_seq -= 0x10000;
-
-			rdev->fence_drv[fence->ring].last_activity = jiffies;
 			write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 
 			if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
-
 				/* good news we believe it's a lockup */
-				printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
-				     fence->seq, seq);
+				dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
+					 fence->seq, seq);
+
+				/* change last activity so nobody else think there is a lockup */
+				for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+					rdev->fence_drv[i].last_activity = jiffies;
+				}
 
 				/* mark the ring as not ready any more */
 				rdev->ring[fence->ring].ready = false;
@@ -387,9 +432,9 @@
 	}
 	rdev->fence_drv[ring].cpu_addr = &rdev->wb.wb[index/4];
 	rdev->fence_drv[ring].gpu_addr = rdev->wb.gpu_addr + index;
-	radeon_fence_write(rdev, atomic_read(&rdev->fence_drv[ring].seq), ring);
+	radeon_fence_write(rdev, rdev->fence_drv[ring].seq, ring);
 	rdev->fence_drv[ring].initialized = true;
-	DRM_INFO("fence driver on ring %d use gpu addr 0x%08Lx and cpu addr 0x%p\n",
+	DRM_INFO("fence driver on ring %d use gpu addr 0x%016llx and cpu addr 0x%p\n",
 		 ring, rdev->fence_drv[ring].gpu_addr, rdev->fence_drv[ring].cpu_addr);
 	write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
 	return 0;
@@ -400,7 +445,8 @@
 	rdev->fence_drv[ring].scratch_reg = -1;
 	rdev->fence_drv[ring].cpu_addr = NULL;
 	rdev->fence_drv[ring].gpu_addr = 0;
-	atomic_set(&rdev->fence_drv[ring].seq, 0);
+	rdev->fence_drv[ring].seq = 0;
+	atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
 	INIT_LIST_HEAD(&rdev->fence_drv[ring].emitted);
 	INIT_LIST_HEAD(&rdev->fence_drv[ring].signaled);
 	init_waitqueue_head(&rdev->fence_drv[ring].queue);
@@ -458,12 +504,12 @@
 			continue;
 
 		seq_printf(m, "--- ring %d ---\n", i);
-		seq_printf(m, "Last signaled fence 0x%08X\n",
-			   radeon_fence_read(rdev, i));
+		seq_printf(m, "Last signaled fence 0x%016lx\n",
+			   atomic64_read(&rdev->fence_drv[i].last_seq));
 		if (!list_empty(&rdev->fence_drv[i].emitted)) {
 			fence = list_entry(rdev->fence_drv[i].emitted.prev,
 					   struct radeon_fence, list);
-			seq_printf(m, "Last emitted fence %p with 0x%08X\n",
+			seq_printf(m, "Last emitted fence %p with 0x%016llx\n",
 				   fence,  fence->seq);
 		}
 	}