Merge "msm: kgsl: Enhance GFT to avoid hang->recover->hang cycle"
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index e2ea262..d7f8508 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -413,7 +413,9 @@
#define KGSL_FT_SKIPFRAME 3
#define KGSL_FT_DISABLE 4
#define KGSL_FT_TEMP_DISABLE 5
-#define KGSL_FT_DEFAULT_POLICY (BIT(KGSL_FT_REPLAY) + BIT(KGSL_FT_SKIPIB))
+#define KGSL_FT_THROTTLE 6
+#define KGSL_FT_DEFAULT_POLICY (BIT(KGSL_FT_REPLAY) + BIT(KGSL_FT_SKIPIB) \
+ + BIT(KGSL_FT_THROTTLE))
/* This internal bit is used to skip the PM dump on replayed command batches */
#define KGSL_FT_SKIP_PMDUMP 31
@@ -431,7 +433,8 @@
{ BIT(KGSL_FT_SKIPIB), "skipib" }, \
{ BIT(KGSL_FT_SKIPFRAME), "skipframe" }, \
{ BIT(KGSL_FT_DISABLE), "disable" }, \
- { BIT(KGSL_FT_TEMP_DISABLE), "temp" }
+ { BIT(KGSL_FT_TEMP_DISABLE), "temp" }, \
+ { BIT(KGSL_FT_THROTTLE), "throttle"}
extern struct adreno_gpudev adreno_a2xx_gpudev;
extern struct adreno_gpudev adreno_a3xx_gpudev;
diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c
index 48d0210..443939a 100644
--- a/drivers/gpu/msm/adreno_dispatch.c
+++ b/drivers/gpu/msm/adreno_dispatch.c
@@ -33,6 +33,15 @@
/* Number of command batches sent at a time from a single context */
static unsigned int _context_cmdbatch_burst = 5;
+/*
+ * GFT throttle parameters. If GFT recovered more than
+ * X times in Y ms invalidate the context and do not attempt recovery.
+ * X -> _fault_throttle_burst
+ * Y -> _fault_throttle_time
+ */
+static unsigned int _fault_throttle_time = 3000;
+static unsigned int _fault_throttle_burst = 3;
+
/* Number of command batches inflight in the ringbuffer at any time */
static unsigned int _dispatcher_inflight = 15;
@@ -1015,6 +1024,35 @@
cmdbatch = replay[0];
/*
+ * If GFT recovered more than X times in Y ms invalidate the context
+ * and do not attempt recovery.
+ * Example: X==3 and Y==3000 ms, GPU hung at 500ms, 1700ms, 2500ms and
+ * 3000ms for the same context, we will not try FT and invalidate the
+ * context @3000ms because context triggered GFT more than 3 times in
+ * last 3 seconds. If a context caused recoverable GPU hangs
+ * where 1st and 4th gpu hang are more than 3 seconds apart we
+ * won't disable GFT and invalidate the context.
+ */
+ if (test_bit(KGSL_FT_THROTTLE, &cmdbatch->fault_policy)) {
+ if (time_after(jiffies, (cmdbatch->context->fault_time
+ + msecs_to_jiffies(_fault_throttle_time)))) {
+ cmdbatch->context->fault_time = jiffies;
+ cmdbatch->context->fault_count = 1;
+ } else {
+ cmdbatch->context->fault_count++;
+ if (cmdbatch->context->fault_count >
+ _fault_throttle_burst) {
+ set_bit(KGSL_FT_DISABLE,
+ &cmdbatch->fault_policy);
+ pr_fault(device, cmdbatch,
+ "gpu fault threshold exceeded %d faults in %d msecs\n",
+ _fault_throttle_burst,
+ _fault_throttle_time);
+ }
+ }
+ }
+
+ /*
* If FT is disabled for this cmdbatch invalidate immediately
*/
@@ -1628,6 +1666,10 @@
static DISPATCHER_UINT_ATTR(context_queue_wait, 0644, 0, _context_queue_wait);
static DISPATCHER_UINT_ATTR(fault_detect_interval, 0644, 0,
_fault_timer_interval);
+static DISPATCHER_UINT_ATTR(fault_throttle_time, 0644, 0,
+ _fault_throttle_time);
+static DISPATCHER_UINT_ATTR(fault_throttle_burst, 0644, 0,
+ _fault_throttle_burst);
static struct attribute *dispatcher_attrs[] = {
&dispatcher_attr_inflight.attr,
@@ -1636,6 +1678,8 @@
&dispatcher_attr_cmdbatch_timeout.attr,
&dispatcher_attr_context_queue_wait.attr,
&dispatcher_attr_fault_detect_interval.attr,
+ &dispatcher_attr_fault_throttle_time.attr,
+ &dispatcher_attr_fault_throttle_burst.attr,
NULL,
};
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index 98fd731..fc4b77e 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -349,6 +349,8 @@
* is set.
* @flags: flags from userspace controlling the behavior of this context
* @pwr_constraint: power constraint from userspace for this context
+ * @fault_count: number of GPU hangs in the last _fault_throttle_time ms
+ * @fault_time: time of the first GPU hang in the last _fault_throttle_time ms
*/
struct kgsl_context {
struct kref refcount;
@@ -367,6 +369,8 @@
unsigned int pagefault_ts;
unsigned int flags;
struct kgsl_pwr_constraint pwr_constraint;
+ unsigned int fault_count;
+ unsigned long fault_time;
};
/**