drm/i915: Convert hangcheck from a timer into a delayed work item

When run as a timer, i915_hangcheck_elapsed() must adhere to all the
rules of running in a softirq context. This is advantageous to us as we
want to minimise the risk that a driver bug will prevent us from
detecting a hung GPU. However, that is irrelevant if the driver bug
prevents us from resetting and recovering. Still, it is prudent not to
rely on mutexes inside the checker, but given the coarseness of
dev->struct_mutex, avoiding it entirely is extremely hard.

Give in and run from a work queue, i.e. outside of softirq.
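
For reference, the general shape of this conversion (a softirq timer callback
replaced by a delayed work item on a private ordered workqueue) is roughly the
sketch below; my_dev, my_check and the 1500ms period are illustrative
placeholders, not i915 symbols:

  #include <linux/kernel.h>
  #include <linux/workqueue.h>
  #include <linux/jiffies.h>
  #include <linux/timer.h>

  #define MY_CHECK_PERIOD_JIFFIES msecs_to_jiffies(1500) /* placeholder period */

  struct my_dev {
          struct workqueue_struct *check_wq;  /* ordered: checks never overlap */
          struct delayed_work check_work;
  };

  static void my_queue_check(struct my_dev *md)
  {
          /* No-op if this work item is already pending, so repeated calls
           * do not keep pushing the check further into the future. */
          queue_delayed_work(md->check_wq, &md->check_work,
                             round_jiffies_up_relative(MY_CHECK_PERIOD_JIFFIES));
  }

  static void my_check(struct work_struct *work)
  {
          struct my_dev *md = container_of(work, struct my_dev,
                                           check_work.work);

          /* Process context: sleeping and taking mutexes is allowed here,
           * unlike in the old softirq timer callback. */

          /* Re-arm for the next period (a real checker would only do this
           * while the device is still busy). */
          my_queue_check(md);
  }

  static int my_check_init(struct my_dev *md)
  {
          md->check_wq = alloc_ordered_workqueue("my-check", 0);
          if (md->check_wq == NULL)
                  return -ENOMEM;

          INIT_DELAYED_WORK(&md->check_work, my_check);
          return 0;
  }

  static void my_check_fini(struct my_dev *md)
  {
          cancel_delayed_work_sync(&md->check_work);
          destroy_workqueue(md->check_wq);
  }

The private ordered queue matches the v2 note below: the idea is to keep the
checker off the driver's shared workqueues so that hang detection never has to
wait behind (or deadlock against) unrelated work items.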

v2: Use own workqueue to avoid deadlocks (Daniel)
    Cleanup commit msg and add comment to i915_queue_hangcheck() (Chris)

Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
[danvet: Remove accidental kerneldoc comment starter, to appease the 0
day builder.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 51e8fe5..6eaf795 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -790,6 +790,14 @@
 		goto out_freewq;
 	}
 
+	dev_priv->gpu_error.hangcheck_wq =
+		alloc_ordered_workqueue("i915-hangcheck", 0);
+	if (dev_priv->gpu_error.hangcheck_wq == NULL) {
+		DRM_ERROR("Failed to create our hangcheck workqueue.\n");
+		ret = -ENOMEM;
+		goto out_freedpwq;
+	}
+
 	intel_irq_init(dev_priv);
 	intel_uncore_sanitize(dev);
 
@@ -864,6 +872,8 @@
 	intel_teardown_gmbus(dev);
 	intel_teardown_mchbar(dev);
 	pm_qos_remove_request(&dev_priv->pm_qos);
+	destroy_workqueue(dev_priv->gpu_error.hangcheck_wq);
+out_freedpwq:
 	destroy_workqueue(dev_priv->dp_wq);
 out_freewq:
 	destroy_workqueue(dev_priv->wq);
@@ -934,7 +944,7 @@
 	}
 
 	/* Free error state after interrupts are fully disabled. */
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
@@ -960,6 +970,7 @@
 
 	destroy_workqueue(dev_priv->dp_wq);
 	destroy_workqueue(dev_priv->wq);
+	destroy_workqueue(dev_priv->gpu_error.hangcheck_wq);
 	pm_qos_remove_request(&dev_priv->pm_qos);
 
 	i915_global_gtt_cleanup(dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 9da4e60..5f50e70 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1402,7 +1402,7 @@
 		return ret;
 	}
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	intel_uncore_forcewake_reset(dev, false);
 	dev_priv->pm.suspended = true;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 760d239..7aee7d5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1345,7 +1345,8 @@
 	/* Hang gpu twice in this window and your context gets banned */
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
-	struct timer_list hangcheck_timer;
+	struct workqueue_struct *hangcheck_wq;
+	struct delayed_work hangcheck_work;
 
 	/* For reset and error_state handling. */
 	spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 9c7c95a..361d18b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4608,7 +4608,7 @@
 	i915_gem_stop_ringbuffers(dev);
 	mutex_unlock(&dev->struct_mutex);
 
-	del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
 	flush_delayed_work(&dev_priv->mm.idle_work);
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 2399eae..23bfe22 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2974,7 +2974,7 @@
 	return HANGCHECK_HUNG;
 }
 
-/**
+/*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
  * if there are no progress, hangcheck score for that ring is increased.
@@ -2982,10 +2982,12 @@
  * we kick the ring. If we see no progress on three subsequent calls
  * we assume chip is wedged and try to fix it by resetting the chip.
  */
-static void i915_hangcheck_elapsed(unsigned long data)
+static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-	struct drm_device *dev = (struct drm_device *)data;
-	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv),
+			     gpu_error.hangcheck_work.work);
+	struct drm_device *dev = dev_priv->dev;
 	struct intel_engine_cs *ring;
 	int i;
 	int busy_count = 0, rings_hung = 0;
@@ -3099,17 +3101,18 @@
 
 void i915_queue_hangcheck(struct drm_device *dev)
 {
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct timer_list *timer = &dev_priv->gpu_error.hangcheck_timer;
+	struct i915_gpu_error *e = &to_i915(dev)->gpu_error;
 
 	if (!i915.enable_hangcheck)
 		return;
 
-	/* Don't continually defer the hangcheck, but make sure it is active */
-	if (timer_pending(timer))
-		return;
-	mod_timer(timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	/* Don't continually defer the hangcheck so that it is always run at
+	 * least once after work has been scheduled on any ring. Otherwise,
+	 * we will ignore a hung ring if a second ring is kept busy.
+	 */
+
+	queue_delayed_work(e->hangcheck_wq, &e->hangcheck_work,
+			   round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
 }
 
 static void ibx_irq_reset(struct drm_device *dev)
@@ -4353,9 +4356,8 @@
 	else
 		dev_priv->pm_rps_events = GEN6_PM_RPS_EVENTS;
 
-	setup_timer(&dev_priv->gpu_error.hangcheck_timer,
-		    i915_hangcheck_elapsed,
-		    (unsigned long) dev);
+	INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
+			  i915_hangcheck_elapsed);
 	INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
 			  intel_hpd_irq_reenable_work);
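
A note on the dropped timer_pending() guard in i915_queue_hangcheck() above:
queue_delayed_work() already returns false and leaves the existing expiry
untouched when the work item is still pending, so the "don't continually
defer" behaviour is preserved without an explicit check. A minimal
illustration, with made-up names that are not part of the patch:

  static struct delayed_work demo_work;  /* hypothetical, initialised elsewhere */

  static void demo_requeue(struct workqueue_struct *wq, unsigned long delay)
  {
          if (!queue_delayed_work(wq, &demo_work, delay)) {
                  /* Already pending: the earlier expiry stands and the
                   * requested delay is ignored. */
          }
  }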