drm/i915: Tighten reset_counter for reset status In the reset_counter, we use two bits to track a GPU hang and reset. The low bit is a "reset-in-progress" flag that we set to signal when we need to break waiters in order for the recovery task to grab the mutex. As soon as the recovery task has the mutex, we can clear that flag (which we do by incrementing the reset_counter thereby incrementing the gobal reset epoch). By clearing that flag when the recovery task holds the struct_mutex, we can forgo a second flag that simply tells GEM to ignore the "reset-in-progress" flag. The second flag we store in the reset_counter is whether the reset failed and we consider the GPU terminally wedged. Whilst this flag is set, all access to the GPU (at least through GEM rather than direct mmio access) is verboten. PS: Fun is in store, as in the future we want to move from a global reset epoch to a per-engine reset engine with request recovery. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch> Link: http://patchwork.freedesktop.org/patch/msgid/1460565315-7748-6-git-send-email-chris@chris-wilson.co.uk

commit: d98c52cf4fa2bb7116a89f1132fc773b1cfa6436 [log] [tgz]
author: Chris Wilson <chris@chris-wilson.co.uk> Wed Apr 13 17:35:05 2016 +0100
committer: Chris Wilson <chris@chris-wilson.co.uk> Thu Apr 14 10:45:40 2016 +0100
tree: 94e588b0ef5dcdbc41748fc37a49de31b4f8641d
parent: 7f1847ebf48b2e5753691cc9e2a404c260383933 [diff] [blame]
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 1dca344..633d0dd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c

@@ -880,23 +880,32 @@
 int i915_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	bool simulated;
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
+	unsigned reset_counter;
 	int ret;
 
 	intel_reset_gt_powersave(dev);
 
 	mutex_lock(&dev->struct_mutex);
 
-	i915_gem_reset(dev);
+	/* Clear any previous failed attempts at recovery. Time to try again. */
+	atomic_andnot(I915_WEDGED, &error->reset_counter);
 
-	simulated = dev_priv->gpu_error.stop_rings != 0;
+	/* Clear the reset-in-progress flag and increment the reset epoch. */
+	reset_counter = atomic_inc_return(&error->reset_counter);
+	if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
+		ret = -EIO;
+		goto error;
+	}
+
+	i915_gem_reset(dev);
 
 	ret = intel_gpu_reset(dev, ALL_ENGINES);
 
 	/* Also reset the gpu hangman. */
-	if (simulated) {
+	if (error->stop_rings != 0) {
 		DRM_INFO("Simulated gpu hang, resetting stop_rings\n");
-		dev_priv->gpu_error.stop_rings = 0;
+		error->stop_rings = 0;
 		if (ret == -ENODEV) {
 			DRM_INFO("Reset not implemented, but ignoring "
 				 "error for simulated gpu hangs\n");
@@ -909,8 +918,7 @@
 
 	if (ret) {
 		DRM_ERROR("Failed to reset chip: %i\n", ret);
-		mutex_unlock(&dev->struct_mutex);
-		return ret;
+		goto error;
 	}
 
 	intel_overlay_reset(dev_priv);
@@ -929,20 +937,14 @@
 	 * was running at the time of the reset (i.e. we weren't VT
 	 * switched away).
 	 */
-
-	/* Used to prevent gem_check_wedged returning -EAGAIN during gpu reset */
-	dev_priv->gpu_error.reload_in_reset = true;
-
 	ret = i915_gem_init_hw(dev);
-
-	dev_priv->gpu_error.reload_in_reset = false;
-
-	mutex_unlock(&dev->struct_mutex);
 	if (ret) {
 		DRM_ERROR("Failed hw init on reset %d\n", ret);
-		return ret;
+		goto error;
 	}
 
+	mutex_unlock(&dev->struct_mutex);
+
 	/*
 	 * rps/rc6 re-init is necessary to restore state lost after the
 	 * reset and the re-install of gt irqs. Skip for ironlake per
@@ -953,6 +955,11 @@
 		intel_enable_gt_powersave(dev);
 
 	return 0;
+
+error:
+	atomic_or(I915_WEDGED, &error->reset_counter);
+	mutex_unlock(&dev->struct_mutex);
+	return ret;
 }
 
 static int i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
commit	d98c52cf4fa2bb7116a89f1132fc773b1cfa6436	[log] [tgz]
author	Chris Wilson <chris@chris-wilson.co.uk>	Wed Apr 13 17:35:05 2016 +0100
committer	Chris Wilson <chris@chris-wilson.co.uk>	Thu Apr 14 10:45:40 2016 +0100
tree	94e588b0ef5dcdbc41748fc37a49de31b4f8641d
parent	7f1847ebf48b2e5753691cc9e2a404c260383933 [diff] [blame]