drm/i915: Don't count semaphore waits towards a stuck ring
If we detect a ring is in a valid wait for another, just let it be.
Eventually it will either begin to progress again, or the entire system
will come grinding to a halt and then hangcheck will fire as soon as the
deadlock is detected.
This error was foretold by Ben in
commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date: Thu May 30 09:04:29 2013 +0300
drm/i915: detect hang using per ring hangcheck_score
"If ring B is waiting on ring A via semaphore, and ring A is making
progress, albeit slowly - the hangcheck will fire. The check will
determine that A is moving, however ring B will appear hung because
the ACTHD doesn't move. I honestly can't say if that's actually a
realistic problem to hit it probably implies the timeout value is too
low."
v2: Make sure we don't even incur the KICK cost whilst waiting.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6a757691..563abe7 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2321,21 +2321,21 @@
i915_seqno_passed(seqno, ring_last_seqno(ring)));
}
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
{
struct drm_i915_private *dev_priv = ring->dev->dev_private;
- u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
- struct intel_ring_buffer *signaller;
- u32 cmd, ipehr, acthd_min;
+ u32 cmd, ipehr, acthd, acthd_min;
ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
if ((ipehr & ~(0x3 << 16)) !=
(MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
- return false;
+ return NULL;
/* ACTHD is likely pointing to the dword after the actual command,
* so scan backwards until we find the MBOX.
*/
+ acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
acthd_min = max((int)acthd - 3 * 4, 0);
do {
cmd = ioread32(ring->virtual_start + acthd);
@@ -2344,22 +2344,53 @@
acthd -= 4;
if (acthd < acthd_min)
- return false;
+ return NULL;
} while (1);
- signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
- return i915_seqno_passed(signaller->get_seqno(signaller, false),
- ioread32(ring->virtual_start+acthd+4)+1);
+ *seqno = ioread32(ring->virtual_start+acthd+4)+1;
+ return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
}
-static bool ring_hung(struct intel_ring_buffer *ring)
+static int semaphore_passed(struct intel_ring_buffer *ring)
+{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
+ struct intel_ring_buffer *signaller;
+ u32 seqno, ctl;
+
+ ring->hangcheck.deadlock = true;
+
+ signaller = semaphore_waits_for(ring, &seqno);
+ if (signaller == NULL || signaller->hangcheck.deadlock)
+ return -1;
+
+ /* cursory check for an unkickable deadlock */
+ ctl = I915_READ_CTL(signaller);
+ if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+ return -1;
+
+ return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
+}
+
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+{
+ struct intel_ring_buffer *ring;
+ int i;
+
+ for_each_ring(ring, dev_priv, i)
+ ring->hangcheck.deadlock = false;
+}
+
+static enum { wait, active, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
{
struct drm_device *dev = ring->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
u32 tmp;
+ if (ring->hangcheck.acthd != acthd)
+ return active;
+
if (IS_GEN2(dev))
- return true;
+ return hung;
/* Is the chip hanging on a WAIT_FOR_EVENT?
* If so we can simply poke the RB_WAIT bit
@@ -2371,19 +2402,24 @@
DRM_ERROR("Kicking stuck wait on %s\n",
ring->name);
I915_WRITE_CTL(ring, tmp);
- return false;
+ return kick;
}
- if (INTEL_INFO(dev)->gen >= 6 &&
- tmp & RING_WAIT_SEMAPHORE &&
- semaphore_passed(ring)) {
- DRM_ERROR("Kicking stuck semaphore on %s\n",
- ring->name);
- I915_WRITE_CTL(ring, tmp);
- return false;
+ if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+ switch (semaphore_passed(ring)) {
+ default:
+ return hung;
+ case 1:
+ DRM_ERROR("Kicking stuck semaphore on %s\n",
+ ring->name);
+ I915_WRITE_CTL(ring, tmp);
+ return kick;
+ case 0:
+ return wait;
+ }
}
- return true;
+ return hung;
}
/**
@@ -2414,6 +2450,8 @@
u32 seqno, acthd;
bool busy = true;
+ semaphore_clear_deadlocks(dev_priv);
+
seqno = ring->get_seqno(ring, false);
acthd = intel_ring_get_active_head(ring);
@@ -2430,17 +2468,36 @@
} else {
int score;
- stuck[i] = ring->hangcheck.acthd == acthd;
- if (stuck[i]) {
- /* Every time we kick the ring, add a
- * small increment to the hangcheck
- * score so that we can catch a
- * batch that is repeatedly kicked.
- */
- score = ring_hung(ring) ? HUNG : KICK;
- } else
+ /* We always increment the hangcheck score
+ * if the ring is busy and still processing
+ * the same request, so that no single request
+ * can run indefinitely (such as a chain of
+ * batches). The only time we do not increment
+ * the hangcheck score on this ring, if this
+ * ring is in a legitimate wait for another
+ * ring. In that case the waiting ring is a
+ * victim and we want to be sure we catch the
+ * right culprit. Then every time we do kick
+ * the ring, add a small increment to the
+ * score so that we can catch a batch that is
+ * being repeatedly kicked and so responsible
+ * for stalling the machine.
+ */
+ switch (ring_stuck(ring, acthd)) {
+ case wait:
+ score = 0;
+ break;
+ case active:
score = BUSY;
-
+ break;
+ case kick:
+ score = KICK;
+ break;
+ case hung:
+ score = HUNG;
+ stuck[i] = true;
+ break;
+ }
ring->hangcheck.score += score;
}
} else {