drm/i915/hangcheck: Prevent long walks across full-ppgtt
With full-ppgtt, it takes the GPU an eon to traverse the entire 256PiB
address space, causing a loop to be detected. Under the current scheme,
if ACTHD walks off the end of a batch buffer and into an empty
address space, we "never" detect the hang. If we always increment the
score as the ACTHD is progressing then we will eventually timeout (after
~46.5s (31 * 1.5s) without advancing onto a new batch). To counter act
this, increase the amount we reduce the score for good batches, so that
only a series of almost-bad batches trigger a full reset. DoS detection
suffers slightly but series of long running shader tests will benefit.
Based on a patch from Chris Wilson.
Testcase: igt/drv_hangman/hangcheck-unterminated
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1456930109-21532-1-git-send-email-mika.kuoppala@intel.com
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a0f1bd7..15aacd0 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1367,8 +1367,6 @@
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
(long long)ring->hangcheck.acthd,
(long long)acthd[i]);
- seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
- (long long)ring->hangcheck.max_acthd);
seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 3b6bfbf..13b5f3a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -230,8 +230,6 @@
return "wait";
case HANGCHECK_ACTIVE:
return "active";
- case HANGCHECK_ACTIVE_LOOP:
- return "active (loop)";
case HANGCHECK_KICK:
return "kick";
case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d1a46ef..53e5104 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -3001,12 +3001,7 @@
memset(ring->hangcheck.instdone, 0,
sizeof(ring->hangcheck.instdone));
- if (acthd > ring->hangcheck.max_acthd) {
- ring->hangcheck.max_acthd = acthd;
- return HANGCHECK_ACTIVE;
- }
-
- return HANGCHECK_ACTIVE_LOOP;
+ return HANGCHECK_ACTIVE;
}
if (!subunits_stuck(ring))
@@ -3083,6 +3078,7 @@
#define BUSY 1
#define KICK 5
#define HUNG 20
+#define ACTIVE_DECAY 15
if (!i915.enable_hangcheck)
return;
@@ -3151,9 +3147,8 @@
switch (ring->hangcheck.action) {
case HANGCHECK_IDLE:
case HANGCHECK_WAIT:
- case HANGCHECK_ACTIVE:
break;
- case HANGCHECK_ACTIVE_LOOP:
+ case HANGCHECK_ACTIVE:
ring->hangcheck.score += BUSY;
break;
case HANGCHECK_KICK:
@@ -3172,10 +3167,12 @@
* attempts across multiple batches.
*/
if (ring->hangcheck.score > 0)
- ring->hangcheck.score--;
+ ring->hangcheck.score -= ACTIVE_DECAY;
+ if (ring->hangcheck.score < 0)
+ ring->hangcheck.score = 0;
/* Clear head and subunit states on seqno movement */
- ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+ ring->hangcheck.acthd = 0;
memset(ring->hangcheck.instdone, 0,
sizeof(ring->hangcheck.instdone));
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index dd910d3..4b1439d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -79,7 +79,6 @@
HANGCHECK_IDLE = 0,
HANGCHECK_WAIT,
HANGCHECK_ACTIVE,
- HANGCHECK_ACTIVE_LOOP,
HANGCHECK_KICK,
HANGCHECK_HUNG,
};
@@ -88,7 +87,6 @@
struct intel_ring_hangcheck {
u64 acthd;
- u64 max_acthd;
u32 seqno;
int score;
enum intel_ring_hangcheck_action action;