drm/i915: Allow disabling error capture
We currently capture the GPU state after we detect a hang. This is vital
for us to both triage and debug hangs in the wild (post-mortem
debugging). However, it comes at the cost of running some potentially
dangerous code (since it has to make very few assumption about the state
of the driver) that is quite resource intensive.
This patch introduces both a method to disable error capture at runtime
(for users who hit bugs at runtime and need a workaround) and to disable
error capture at compiletime (for realtime users who want to minimise
any possible latency, and never require error capture, saving ~30k of
code). The cost is that we now have to be wary of (and test!) a kconfig
flag and a module parameter. The effect of the module parameter is easy
to verify through code inspection and runtime testing, but a kconfig flag
needs regular compile checking.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@linux.intel.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch
Link: http://patchwork.freedesktop.org/patch/msgid/20161012090522.367-2-chris@chris-wilson.co.uk
diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e46..8844b99 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -46,6 +46,19 @@
If in doubt, say "N".
+config DRM_I915_CAPTURE_ERROR
+ bool "Enable capturing GPU state following a hang"
+ depends on DRM_I915
+ default y
+ help
+ This option enables capturing the GPU state when a hang is detected.
+ This information is vital for triaging hangs and assists in debugging.
+ Please report any hang to
+ https://bugs.freedesktop.org/enter_bug.cgi?product=DRI
+ for triaging.
+
+ If in doubt, say "Y".
+
config DRM_I915_USERPTR
bool "Always enable userptr support"
depends on DRM_I915
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a998c2b..8790ae4 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -42,7 +42,6 @@
i915_gem_stolen.o \
i915_gem_tiling.o \
i915_gem_userptr.o \
- i915_gpu_error.o \
i915_trace_points.o \
intel_breadcrumbs.o \
intel_engine_cs.o \
@@ -107,6 +106,9 @@
intel_sdvo.o \
intel_tv.o
+# Post-mortem debug and GPU hang state capture
+i915-$(CONFIG_DRM_I915_CAPTURE_ERROR) += i915_gpu_error.o
+
# virtual gpu code
i915-y += i915_vgpu.o
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f6762e0..358663e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -960,6 +960,8 @@
return 0;
}
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
static ssize_t
i915_error_state_write(struct file *filp,
const char __user *ubuf,
@@ -1042,6 +1044,8 @@
.release = i915_error_state_release,
};
+#endif
+
static int
i915_next_seqno_get(void *data, u64 *val)
{
@@ -5398,7 +5402,9 @@
{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
{"i915_ring_test_irq", &i915_ring_test_irq_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops},
+#endif
{"i915_next_seqno", &i915_next_seqno_fops},
{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
{"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4553a53..380590b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3544,6 +3544,8 @@
#endif
/* i915_gpu_error.c */
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
@@ -3564,6 +3566,20 @@
void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
void i915_destroy_error_state(struct drm_device *dev);
+#else
+
+static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
+ u32 engine_mask,
+ const char *error_msg)
+{
+}
+
+static inline void i915_destroy_error_state(struct drm_device *dev)
+{
+}
+
+#endif
+
const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
/* i915_cmd_parser.c */
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 04205c8..c88c0d1 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1464,6 +1464,9 @@
struct drm_i915_error_state *error;
unsigned long flags;
+ if (!i915.error_capture)
+ return;
+
if (READ_ONCE(dev_priv->gpu_error.first_error))
return;
diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
index 768ad89..629e433 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -47,6 +47,7 @@
.load_detect_test = 0,
.force_reset_modeset_test = 0,
.reset = true,
+ .error_capture = true,
.invert_brightness = 0,
.disable_display = 0,
.enable_cmd_parser = 1,
@@ -115,6 +116,14 @@
module_param_named_unsafe(reset, i915.reset, bool, 0600);
MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+module_param_named(error_capture, i915.error_capture, bool, 0600);
+MODULE_PARM_DESC(error_capture,
+ "Record the GPU state following a hang. "
+ "This information in /sys/class/drm/card<N>/error is vital for "
+ "triaging and debugging hangs.");
+#endif
+
module_param_named_unsafe(enable_hangcheck, i915.enable_hangcheck, bool, 0644);
MODULE_PARM_DESC(enable_hangcheck,
"Periodically check GPU activity for detecting hangs. "
diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
index 3a0dd78..94efc89 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -59,6 +59,7 @@
bool load_detect_test;
bool force_reset_modeset_test;
bool reset;
+ bool error_capture;
bool disable_display;
bool verbose_state_checks;
bool nuclear_pageflip;
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 1012eee..47590ab 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -514,6 +514,8 @@
NULL,
};
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr, char *buf,
loff_t off, size_t count)
@@ -571,6 +573,21 @@
.write = error_state_write,
};
+static void i915_setup_error_capture(struct device *kdev)
+{
+ if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
+ DRM_ERROR("error_state sysfs setup failed\n");
+}
+
+static void i915_teardown_error_capture(struct device *kdev)
+{
+ sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+}
+#else
+static void i915_setup_error_capture(struct device *kdev) {}
+static void i915_teardown_error_capture(struct device *kdev) {}
+#endif
+
void i915_setup_sysfs(struct drm_i915_private *dev_priv)
{
struct device *kdev = dev_priv->drm.primary->kdev;
@@ -617,17 +634,15 @@
if (ret)
DRM_ERROR("RPS sysfs setup failed\n");
- ret = sysfs_create_bin_file(&kdev->kobj,
- &error_state_attr);
- if (ret)
- DRM_ERROR("error_state sysfs setup failed\n");
+ i915_setup_error_capture(kdev);
}
void i915_teardown_sysfs(struct drm_i915_private *dev_priv)
{
struct device *kdev = dev_priv->drm.primary->kdev;
- sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+ i915_teardown_error_capture(kdev);
+
if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv))
sysfs_remove_files(&kdev->kobj, vlv_attrs);
else
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index e4bdd3a..cfcb03f 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -17097,6 +17097,8 @@
return 0;
}
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
struct intel_display_error_state {
u32 power_well_driver;
@@ -17279,3 +17281,5 @@
err_printf(m, " VSYNC: %08x\n", error->transcoder[i].vsync);
}
}
+
+#endif
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a24bc8c..8c411bf 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -1470,6 +1470,8 @@
kfree(dev_priv->overlay);
}
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
struct intel_overlay_error_state {
struct overlay_registers regs;
unsigned long base;
@@ -1587,3 +1589,5 @@
P(UVSCALEV);
#undef P
}
+
+#endif