Merge "msm: kgsl: GPU fault tolerance timer"
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 5ad2394..b917248 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -123,7 +123,7 @@
#define LONG_IB_DETECT_REG_INDEX_START 1
#define LONG_IB_DETECT_REG_INDEX_END 5
-unsigned int ft_detect_regs[] = {
+unsigned int ft_detect_regs[FT_DETECT_REGS_COUNT] = {
A3XX_RBBM_STATUS,
REG_CP_RB_RPTR, /* LONG_IB_DETECT_REG_INDEX_START */
REG_CP_IB1_BASE,
@@ -138,8 +138,6 @@
0
};
-const unsigned int ft_detect_regs_count = ARRAY_SIZE(ft_detect_regs);
-
/*
* This is the master list of all GPU cores that are supported by this
* driver.
@@ -544,6 +542,8 @@
/* Reset the time-out in our idle timer */
mod_timer_pending(&device->idle_timer,
jiffies + device->pwrctrl.interval_timeout);
+ mod_timer_pending(&device->hang_timer,
+ (jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART)));
return result;
}
@@ -1799,6 +1799,9 @@
if (KGSL_STATE_DUMP_AND_FT != device->state)
mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
+ mod_timer(&device->hang_timer,
+ (jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART)));
+
adreno_perfcounter_start(adreno_dev);
device->reset_counter++;
@@ -1834,6 +1837,7 @@
device->ftbl->irqctrl(device, 0);
kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
del_timer_sync(&device->idle_timer);
+ del_timer_sync(&device->hang_timer);
adreno_ocmem_gmem_free(adreno_dev);
@@ -2714,6 +2718,9 @@
} else {
kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
mod_timer(&device->idle_timer, jiffies + FIRST_TIMEOUT);
+ mod_timer(&device->hang_timer,
+ (jiffies +
+ msecs_to_jiffies(KGSL_TIMEOUT_PART)));
}
complete_all(&device->ft_gate);
}
@@ -2905,7 +2912,7 @@
unsigned int rbbm_status;
unsigned long wait_time;
unsigned long wait_time_part;
- unsigned int prev_reg_val[ft_detect_regs_count];
+ unsigned int prev_reg_val[FT_DETECT_REGS_COUNT];
memset(prev_reg_val, 0, sizeof(prev_reg_val));
@@ -3300,7 +3307,8 @@
unsigned int *prev_reg_val)
{
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
- unsigned int curr_reg_val[ft_detect_regs_count];
+ struct adreno_ringbuffer *rb = &adreno_dev->ringbuffer;
+ unsigned int curr_reg_val[FT_DETECT_REGS_COUNT];
unsigned int fast_hang_detected = 1;
unsigned int long_ib_detected = 1;
unsigned int i;
@@ -3320,7 +3328,9 @@
if (!(adreno_dev->ringbuffer.flags & KGSL_FLAGS_STARTED))
return 0;
- if (is_adreno_rbbm_status_idle(device)) {
+ if (is_adreno_rbbm_status_idle(device) &&
+ (kgsl_readtimestamp(device, NULL, KGSL_TIMESTAMP_RETIRED)
+ == rb->timestamp[KGSL_MEMSTORE_GLOBAL])) {
/*
* On A2XX if the RPTR != WPTR and the device is idle, then
@@ -3352,7 +3362,7 @@
msecs_to_jiffies(KGSL_TIMEOUT_PART-1));
/* Read the current Hang detect reg values here */
- for (i = 0; i < ft_detect_regs_count; i++) {
+ for (i = 0; i < FT_DETECT_REGS_COUNT; i++) {
if (ft_detect_regs[i] == 0)
continue;
adreno_regread(device, ft_detect_regs[i],
@@ -3387,7 +3397,7 @@
"Fault tolerance no context found\n");
}
}
- for (i = 0; i < ft_detect_regs_count; i++) {
+ for (i = 0; i < FT_DETECT_REGS_COUNT; i++) {
if (curr_reg_val[i] != prev_reg_val[i]) {
fast_hang_detected = 0;
@@ -3463,55 +3473,11 @@
/* If hangs are not detected copy the current reg values
* to previous values and return no hang */
- for (i = 0; i < ft_detect_regs_count; i++)
+ for (i = 0; i < FT_DETECT_REGS_COUNT; i++)
prev_reg_val[i] = curr_reg_val[i];
return 0;
}
-/**
- * adreno_handle_hang - Process a hang detected in adreno_waittimestamp
- * @device - pointer to a KGSL device structure
- * @context - pointer to the active KGSL context
- * @timestamp - the timestamp that the process was waiting for
- *
- * Process a possible GPU hang and try fault tolerance from it
- * cleanly
- */
-static int adreno_handle_hang(struct kgsl_device *device,
- struct kgsl_context *context, unsigned int timestamp)
-{
- struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
- unsigned int context_id = _get_context_id(context);
- unsigned int ts_issued;
- unsigned int rptr;
-
- /* Do one last check to see if we somehow made it through */
- if (kgsl_check_timestamp(device, context, timestamp))
- return 0;
-
- ts_issued = adreno_dev->ringbuffer.timestamp[context_id];
-
- adreno_regread(device, REG_CP_RB_RPTR, &rptr);
-
- /* Make sure timestamp check finished before triggering a hang */
- mb();
-
- KGSL_DRV_WARN(device,
- "Device hang detected while waiting for timestamp: "
- "<%d:0x%x>, last submitted timestamp: <%d:0x%x>, "
- "retired timestamp: <%d:0x%x>, wptr: 0x%x, rptr: 0x%x\n",
- context_id, timestamp, context_id, ts_issued, context_id,
- kgsl_readtimestamp(device, context,
- KGSL_TIMESTAMP_RETIRED),
- adreno_dev->ringbuffer.wptr, rptr);
-
- /* Return 0 after a successful fault tolerance */
- if (!adreno_dump_and_exec_ft(device))
- return 0;
-
- return -ETIMEDOUT;
-}
-
static int _check_pending_timestamp(struct kgsl_device *device,
struct kgsl_context *context, unsigned int timestamp)
{
@@ -3560,7 +3526,6 @@
struct adreno_context *adreno_ctx = context ? context->devctxt : NULL;
struct kgsl_pwrctrl *pwr = &device->pwrctrl;
unsigned int context_id = _get_context_id(context);
- unsigned int prev_reg_val[ft_detect_regs_count];
unsigned int time_elapsed = 0;
unsigned int wait;
int ts_compare = 1;
@@ -3588,10 +3553,6 @@
context->wait_on_invalid_ts = false;
}
-
- /* Clear the registers used for hang detection */
- memset(prev_reg_val, 0, sizeof(prev_reg_val));
-
/*
* On the first time through the loop only wait 100ms.
* this gives enough time for the engine to start moving and oddly
@@ -3618,12 +3579,6 @@
break;
}
- /* Check to see if the GPU is hung */
- if (adreno_ft_detect(device, prev_reg_val)) {
- ret = adreno_handle_hang(device, context, timestamp);
- break;
- }
-
/*
* For proper power accounting sometimes we need to call
* io_wait_interruptible_timeout and sometimes we need to call
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 782209d..531a77c 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -247,6 +247,8 @@
unsigned int replay_for_snapshot;
};
+#define FT_DETECT_REGS_COUNT 12
+
/* Fault Tolerance policy flags */
#define KGSL_FT_DISABLE BIT(0)
#define KGSL_FT_REPLAY BIT(1)
@@ -285,7 +287,6 @@
extern const unsigned int a330_registers_count;
extern unsigned int ft_detect_regs[];
-extern const unsigned int ft_detect_regs_count;
int adreno_coresight_enable(struct coresight_device *csdev);
void adreno_coresight_disable(struct coresight_device *csdev);
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index d12853f..70223db 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -66,7 +66,7 @@
unsigned long wait_time;
unsigned long wait_timeout = msecs_to_jiffies(ADRENO_IDLE_TIMEOUT);
unsigned long wait_time_part;
- unsigned int prev_reg_val[ft_detect_regs_count];
+ unsigned int prev_reg_val[FT_DETECT_REGS_COUNT];
memset(prev_reg_val, 0, sizeof(prev_reg_val));
diff --git a/drivers/gpu/msm/kgsl.c b/drivers/gpu/msm/kgsl.c
index 1d2c341..becb4ef 100644
--- a/drivers/gpu/msm/kgsl.c
+++ b/drivers/gpu/msm/kgsl.c
@@ -39,6 +39,7 @@
#include "kgsl_device.h"
#include "kgsl_trace.h"
#include "kgsl_sync.h"
+#include "adreno.h"
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "kgsl."
@@ -59,6 +60,54 @@
};
/**
+ * kgsl_hang_check() - Check for GPU hang
+ * data: KGSL device structure
+ *
+ * This function is called every KGSL_TIMEOUT_PART time when
+ * GPU is active to check for hang. If a hang is detected we
+ * trigger fault tolerance.
+ */
+void kgsl_hang_check(struct work_struct *work)
+{
+ struct kgsl_device *device = container_of(work, struct kgsl_device,
+ hang_check_ws);
+ static unsigned int prev_reg_val[FT_DETECT_REGS_COUNT];
+
+ mutex_lock(&device->mutex);
+
+ if (device->state == KGSL_STATE_ACTIVE) {
+
+ /* Check to see if the GPU is hung */
+ if (adreno_ft_detect(device, prev_reg_val))
+ adreno_dump_and_exec_ft(device);
+
+ mod_timer(&device->hang_timer,
+ (jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART)));
+ }
+
+ mutex_unlock(&device->mutex);
+}
+
+/**
+ * hang_timer() - Hang timer function
+ * data: KGSL device structure
+ *
+ * This function is called when hang timer expires, in this
+ * function we check if GPU is in active state and queue the
+ * work on device workqueue to check for the hang. We restart
+ * the timer after KGSL_TIMEOUT_PART time.
+ */
+void hang_timer(unsigned long data)
+{
+ struct kgsl_device *device = (struct kgsl_device *) data;
+
+ if (device->state == KGSL_STATE_ACTIVE) {
+ /* Have work run in a non-interrupt context. */
+ queue_work(device->work_queue, &device->hang_check_ws);
+ }
+}
+
+/**
* kgsl_trace_issueibcmds() - Call trace_issueibcmds by proxy
* device: KGSL device
* id: ID of the context submitting the command
@@ -550,6 +599,7 @@
/* Don't let the timer wake us during suspended sleep. */
del_timer_sync(&device->idle_timer);
+ del_timer_sync(&device->hang_timer);
switch (device->state) {
case KGSL_STATE_INIT:
break;
@@ -3292,6 +3342,7 @@
setup_timer(&device->idle_timer, kgsl_timer, (unsigned long) device);
+ setup_timer(&device->hang_timer, hang_timer, (unsigned long) device);
status = kgsl_create_device_workqueue(device);
if (status)
goto error_pwrctrl_close;
@@ -3371,9 +3422,7 @@
/* Disable the idle timer so we don't get interrupted */
del_timer_sync(&device->idle_timer);
- mutex_unlock(&device->mutex);
- flush_workqueue(device->work_queue);
- mutex_lock(&device->mutex);
+ del_timer_sync(&device->hang_timer);
/* Force on the clocks */
kgsl_pwrctrl_wake(device);
diff --git a/drivers/gpu/msm/kgsl.h b/drivers/gpu/msm/kgsl.h
index 4a1f291..0983111 100644
--- a/drivers/gpu/msm/kgsl.h
+++ b/drivers/gpu/msm/kgsl.h
@@ -203,6 +203,7 @@
#define MMU_CONFIG 1
#endif
+void kgsl_hang_check(struct work_struct *work);
void kgsl_mem_entry_destroy(struct kref *kref);
int kgsl_postmortem_dump(struct kgsl_device *device, int manual);
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index 6477cbd..94792a0 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -172,7 +172,9 @@
struct completion hwaccess_gate;
const struct kgsl_functable *ftbl;
struct work_struct idle_check_ws;
+ struct work_struct hang_check_ws;
struct timer_list idle_timer;
+ struct timer_list hang_timer;
struct kgsl_pwrctrl pwrctrl;
int open_count;
@@ -237,6 +239,8 @@
.ft_gate = COMPLETION_INITIALIZER((_dev).ft_gate),\
.idle_check_ws = __WORK_INITIALIZER((_dev).idle_check_ws,\
kgsl_idle_check),\
+ .hang_check_ws = __WORK_INITIALIZER((_dev).hang_check_ws,\
+ kgsl_hang_check),\
.ts_expired_ws = __WORK_INITIALIZER((_dev).ts_expired_ws,\
kgsl_process_events),\
.context_idr = IDR_INIT((_dev).context_idr),\
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 847c59e..3a2345e 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -1242,6 +1242,7 @@
kgsl_pwrctrl_request_state(device, KGSL_STATE_NONE);
return -EBUSY;
}
+ del_timer_sync(&device->hang_timer);
kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
kgsl_pwrctrl_clk(device, KGSL_PWRFLAGS_OFF, KGSL_STATE_NAP);
kgsl_pwrctrl_set_state(device, KGSL_STATE_NAP);
@@ -1311,6 +1312,7 @@
case KGSL_STATE_NAP:
case KGSL_STATE_SLEEP:
del_timer_sync(&device->idle_timer);
+ del_timer_sync(&device->hang_timer);
/* make sure power is on to stop the device*/
kgsl_pwrctrl_enable(device);
device->ftbl->suspend_context(device);
@@ -1403,6 +1405,8 @@
/* Re-enable HW access */
mod_timer(&device->idle_timer,
jiffies + device->pwrctrl.interval_timeout);
+ mod_timer(&device->hang_timer,
+ (jiffies + msecs_to_jiffies(KGSL_TIMEOUT_PART)));
pm_qos_update_request(&device->pwrctrl.pm_qos_req_dma,
device->pwrctrl.pm_qos_latency);
case KGSL_STATE_ACTIVE: