Merge "msm: kgsl: Implement KGSL fault tolerance policy in the dispatcher"
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 184dd982..3d1a3ae 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -114,18 +114,6 @@
.long_ib_detect = 1,
};
-/* This set of registers are used for Hang detection
- * If the values of these registers are same after
- * KGSL_TIMEOUT_PART time, GPU hang is reported in
- * kernel log.
- * *****ALERT******ALERT********ALERT*************
- * Order of registers below is important, registers
- * from LONG_IB_DETECT_REG_INDEX_START to
- * LONG_IB_DETECT_REG_INDEX_END are used in long ib detection.
- */
-#define LONG_IB_DETECT_REG_INDEX_START 1
-#define LONG_IB_DETECT_REG_INDEX_END 5
-
unsigned int ft_detect_regs[FT_DETECT_REGS_COUNT];
/*
@@ -1722,6 +1710,9 @@
(device->pwrctrl.gpu_cx &&
regulator_is_enabled(device->pwrctrl.gpu_cx)));
+ /* Clear any GPU faults that might have been left over */
+ adreno_set_gpu_fault(adreno_dev, 0);
+
/* Power up the device */
kgsl_pwrctrl_enable(device);
@@ -1830,29 +1821,35 @@
*/
int adreno_reset(struct kgsl_device *device)
{
- int ret;
+ int ret = 0;
/* Try soft reset first */
- if (adreno_soft_reset(device) == 0)
- return 0;
+ if (adreno_soft_reset(device) != 0) {
+ KGSL_DEV_ERR_ONCE(device, "Device soft reset failed\n");
- /* If it failed, then pull the power */
- ret = adreno_stop(device);
- if (ret)
- return ret;
+ /* If it failed, then pull the power */
+ ret = adreno_stop(device);
+ if (ret)
+ return ret;
- ret = adreno_start(device);
+ ret = adreno_start(device);
- if (ret == 0) {
- /*
- * If active_cnt is non-zero then the system was active before
- * going into a reset - put it back in that state
- */
-
- if (atomic_read(&device->active_cnt))
- kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
+ if (ret)
+ return ret;
}
+ /*
+ * If active_cnt is non-zero then the system was active before
+ * going into a reset - put it back in that state
+ */
+
+ if (atomic_read(&device->active_cnt))
+ kgsl_pwrctrl_set_state(device, KGSL_STATE_ACTIVE);
+
+ /* Set the page table back to the default page table */
+ kgsl_mmu_setstate(&device->mmu, device->mmu.defaultpagetable,
+ KGSL_MEMSTORE_GLOBAL);
+
return ret;
}
@@ -2310,6 +2307,13 @@
/* Stop the ringbuffer */
adreno_ringbuffer_stop(&adreno_dev->ringbuffer);
+ if (kgsl_pwrctrl_isenabled(device))
+ device->ftbl->irqctrl(device, 0);
+
+ kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
+
+ adreno_set_gpu_fault(adreno_dev, 0);
+
/* Delete the idle timer */
del_timer_sync(&device->idle_timer);
@@ -2391,12 +2395,20 @@
0x110, 0x110);
while (time_before(jiffies, wait)) {
+ /*
+ * If we fault, stop waiting and return an error. The dispatcher
+ * will clean up the fault from the work queue, but we need to
+ * make sure we don't block it by waiting for an idle that
+ * will never come.
+ */
+
+ if (adreno_gpu_fault(adreno_dev) != 0)
+ return -EDEADLK;
+
if (adreno_isidle(device))
return 0;
}
- kgsl_postmortem_dump(device, 0);
-
return -ETIMEDOUT;
}
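
The idle wait above now has two exits: a fault bail-out and a timeout. A minimal user-space sketch of the same wait pattern, with the jiffies/time_before() machinery replaced by a plain poll counter and the kernel helpers stood in by local flags (illustration only, not the driver API):

#include <errno.h>
#include <stdio.h>

static int gpu_fault;	/* stands in for adreno_gpu_fault() != 0 */
static int gpu_idle;	/* stands in for adreno_isidle() */

/* Poll for idle, but bail out early on a fault so the dispatcher's
 * recovery work is never blocked behind an idle that will never come */
static int wait_for_idle(int max_polls)
{
	for (int i = 0; i < max_polls; i++) {
		if (gpu_fault)
			return -EDEADLK;
		if (gpu_idle)
			return 0;
	}
	return -ETIMEDOUT;
}

int main(void)
{
	gpu_fault = 1;
	printf("%d\n", wait_for_idle(1000));	/* prints -35 (-EDEADLK) */
	return 0;
}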
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 32e43b2..d070e29 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -35,12 +35,11 @@
#define ADRENO_CHIPID_PATCH(_id) ((_id) & 0xFF)
/* Flags to control command packet settings */
-#define KGSL_CMD_FLAGS_NONE 0x00000000
-#define KGSL_CMD_FLAGS_PMODE 0x00000001
-#define KGSL_CMD_FLAGS_INTERNAL_ISSUE 0x00000002
-#define KGSL_CMD_FLAGS_GET_INT 0x00000004
-#define KGSL_CMD_FLAGS_PROFILE 0x00000008
-#define KGSL_CMD_FLAGS_EOF 0x00000100
+#define KGSL_CMD_FLAGS_NONE 0
+#define KGSL_CMD_FLAGS_PMODE BIT(0)
+#define KGSL_CMD_FLAGS_INTERNAL_ISSUE BIT(1)
+#define KGSL_CMD_FLAGS_WFI BIT(2)
+#define KGSL_CMD_FLAGS_PROFILE BIT(3)
/* Command identifiers */
#define KGSL_CONTEXT_TO_MEM_IDENTIFIER 0x2EADBEEF
@@ -96,6 +95,10 @@
TRACE_BUS_CTL,
};
+#define ADRENO_SOFT_FAULT 1
+#define ADRENO_HARD_FAULT 2
+#define ADRENO_TIMEOUT_FAULT 3
+
/*
* Maximum size of the dispatcher ringbuffer - the actual inflight size will be
 * smaller than this but this size will allow for a larger range of inflight
@@ -110,7 +113,7 @@
* @state: Current state of the dispatcher (active or paused)
* @timer: Timer to monitor the progress of the command batches
* @inflight: Number of command batch operations pending in the ringbuffer
- * @fault: True if a HW fault was detected
+ * @fault: Non-zero if a fault was detected.
* @pending: Priority list of contexts waiting to submit command batches
* @plist_lock: Spin lock to protect the pending queue
 * @cmdqueue: Queue of command batches currently in flight
@@ -125,8 +128,9 @@
struct mutex mutex;
unsigned int state;
struct timer_list timer;
+ struct timer_list fault_timer;
unsigned int inflight;
- int fault;
+ atomic_t fault;
struct plist_head pending;
spinlock_t plist_lock;
struct kgsl_cmdbatch *cmdqueue[ADRENO_DISPATCH_CMDQUEUE_SIZE];
@@ -340,13 +344,16 @@
};
/* Fault Tolerance policy flags */
-#define KGSL_FT_OFF BIT(0)
-#define KGSL_FT_REPLAY BIT(1)
-#define KGSL_FT_SKIPIB BIT(2)
-#define KGSL_FT_SKIPFRAME BIT(3)
-#define KGSL_FT_DISABLE BIT(4)
-#define KGSL_FT_TEMP_DISABLE BIT(5)
-#define KGSL_FT_DEFAULT_POLICY (KGSL_FT_REPLAY + KGSL_FT_SKIPIB)
+#define KGSL_FT_OFF 0
+#define KGSL_FT_REPLAY 1
+#define KGSL_FT_SKIPIB 2
+#define KGSL_FT_SKIPFRAME 3
+#define KGSL_FT_DISABLE 4
+#define KGSL_FT_TEMP_DISABLE 5
+#define KGSL_FT_DEFAULT_POLICY (BIT(KGSL_FT_REPLAY) + BIT(KGSL_FT_SKIPIB))
+
+/* This internal bit is used to skip the PM dump on replayed command batches */
+#define KGSL_FT_SKIP_PMDUMP 31
/* Pagefault policy flags */
#define KGSL_FT_PAGEFAULT_INT_ENABLE BIT(0)
@@ -356,6 +363,14 @@
#define KGSL_FT_PAGEFAULT_DEFAULT_POLICY (KGSL_FT_PAGEFAULT_INT_ENABLE + \
KGSL_FT_PAGEFAULT_GPUHALT_ENABLE)
+#define ADRENO_FT_TYPES \
+ { BIT(KGSL_FT_OFF), "off" }, \
+ { BIT(KGSL_FT_REPLAY), "replay" }, \
+ { BIT(KGSL_FT_SKIPIB), "skipib" }, \
+ { BIT(KGSL_FT_SKIPFRAME), "skipframe" }, \
+ { BIT(KGSL_FT_DISABLE), "disable" }, \
+ { BIT(KGSL_FT_TEMP_DISABLE), "temp" }
+
extern struct adreno_gpudev adreno_a2xx_gpudev;
extern struct adreno_gpudev adreno_a3xx_gpudev;
@@ -741,4 +756,31 @@
return ADRENO_REG_REGISTER_MAX;
return adreno_dev->gpudev->reg_offsets->offsets[offset_name];
}
+
+/**
+ * adreno_gpu_fault() - Return the current fault state of the GPU
+ * @adreno_dev: A pointer to the adreno_device to query
+ *
+ * Return 0 if there is no fault, or a positive value indicating the type of
+ * the last fault that occurred
+ */
+static inline unsigned int adreno_gpu_fault(struct adreno_device *adreno_dev)
+{
+ smp_rmb();
+ return atomic_read(&adreno_dev->dispatcher.fault);
+}
+
+/**
+ * adreno_set_gpu_fault() - Set the current fault status of the GPU
+ * @adreno_dev: A pointer to the adreno_device to set
+ * @state: fault state to set
+ *
+ */
+static inline void adreno_set_gpu_fault(struct adreno_device *adreno_dev,
+ int state)
+{
+ atomic_set(&adreno_dev->dispatcher.fault, state);
+ smp_wmb();
+}
+
#endif /*__ADRENO_H */
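
The new accessors pair atomic_read()/atomic_set() with explicit smp_rmb()/smp_wmb() so that a fault code published from interrupt context is observed in order by the dispatcher. A minimal sketch of the same publish/observe pattern in portable C11, with acquire/release atomics standing in for the kernel primitives (local names, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

#define ADRENO_SOFT_FAULT 1

static atomic_int fault;	/* stands in for dispatcher.fault */

/* IRQ side: publish the fault code (analogous to atomic_set() + smp_wmb()) */
static void set_gpu_fault(int state)
{
	atomic_store_explicit(&fault, state, memory_order_release);
}

/* Dispatcher side: observe it (analogous to smp_rmb() + atomic_read()) */
static int gpu_fault(void)
{
	return atomic_load_explicit(&fault, memory_order_acquire);
}

int main(void)
{
	set_gpu_fault(ADRENO_SOFT_FAULT);
	printf("fault = %d\n", gpu_fault());	/* prints 1 */
	return 0;
}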
diff --git a/drivers/gpu/msm/adreno_a3xx.c b/drivers/gpu/msm/adreno_a3xx.c
index 8b75c4e..c4f81fa 100644
--- a/drivers/gpu/msm/adreno_a3xx.c
+++ b/drivers/gpu/msm/adreno_a3xx.c
@@ -2596,7 +2596,7 @@
/* Clear the error */
kgsl_regwrite(device, A3XX_RBBM_AHB_CMD, (1 << 3));
- return;
+ goto done;
}
case A3XX_INT_RBBM_REG_TIMEOUT:
err = "RBBM: AHB register timeout";
@@ -2637,10 +2637,15 @@
case A3XX_INT_UCHE_OOB_ACCESS:
err = "UCHE: Out of bounds access";
break;
+ default:
+ return;
}
-
KGSL_DRV_CRIT(device, "%s\n", err);
kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
+
+done:
+ /* Trigger a fault in the dispatcher - this will effect a restart */
+ adreno_dispatcher_irq_fault(device);
}
static void a3xx_cp_callback(struct adreno_device *adreno_dev, int irq)
diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c
index e429934..4f1d7ec 100644
--- a/drivers/gpu/msm/adreno_dispatch.c
+++ b/drivers/gpu/msm/adreno_dispatch.c
@@ -41,6 +41,71 @@
/* Command batch timeout (in milliseconds) */
static unsigned int _cmdbatch_timeout = 2000;
+/* Interval for reading and comparing fault detection registers */
+static unsigned int _fault_timer_interval = 50;
+
+/* Local array for the current set of fault detect registers */
+static unsigned int fault_detect_regs[FT_DETECT_REGS_COUNT];
+
+/* The last retired global timestamp read during fault detect */
+static unsigned int fault_detect_ts;
+
+/**
+ * fault_detect_read() - Read the set of fault detect registers
+ * @device: Pointer to the KGSL device struct
+ *
+ * Read the set of fault detect registers and store them in the local array.
+ * These initial values are later compared against new readings by
+ * fault_detect_read_compare()
+ */
+static void fault_detect_read(struct kgsl_device *device)
+{
+ int i;
+
+ fault_detect_ts = kgsl_readtimestamp(device, NULL,
+ KGSL_TIMESTAMP_RETIRED);
+
+ for (i = 0; i < FT_DETECT_REGS_COUNT; i++) {
+ if (ft_detect_regs[i] == 0)
+ continue;
+ kgsl_regread(device, ft_detect_regs[i],
+ &fault_detect_regs[i]);
+ }
+}
+
+/**
+ * fault_detect_read_compare() - Read the fault detect registers and compare
+ * them to the current value
+ * @device: Pointer to the KGSL device struct
+ *
+ * Read the set of fault detect registers and compare them to the current set
+ * of registers. Return 1 if any of the register values changed
+ */
+static int fault_detect_read_compare(struct kgsl_device *device)
+{
+ int i, ret = 0;
+ unsigned int ts;
+
+ for (i = 0; i < FT_DETECT_REGS_COUNT; i++) {
+ unsigned int val;
+
+ if (ft_detect_regs[i] == 0)
+ continue;
+ kgsl_regread(device, ft_detect_regs[i], &val);
+ if (val != fault_detect_regs[i])
+ ret = 1;
+ fault_detect_regs[i] = val;
+ }
+
+ ts = kgsl_readtimestamp(device, NULL, KGSL_TIMESTAMP_RETIRED);
+ if (ts != fault_detect_ts)
+ ret = 1;
+
+ fault_detect_ts = ts;
+
+ return ret;
+}
+
/**
* adreno_dispatcher_get_cmdbatch() - Get a new command from a context queue
* @drawctxt: Pointer to the adreno draw context
@@ -162,9 +227,17 @@
ret = adreno_ringbuffer_submitcmd(adreno_dev, cmdbatch);
- /* Turn the GPU back off on failure. Sad face. */
- if (ret && dispatcher->inflight == 1)
- kgsl_active_count_put(device);
+ /*
+ * On the first command, if the submission was successful, then read the
+ * fault registers. If it failed then turn off the GPU. Sad face.
+ */
+
+ if (dispatcher->inflight == 1) {
+ if (ret == 0)
+ fault_detect_read(device);
+ else
+ kgsl_active_count_put(device);
+ }
mutex_unlock(&device->mutex);
@@ -191,6 +264,12 @@
cmdbatch->expires = jiffies +
msecs_to_jiffies(_cmdbatch_timeout);
mod_timer(&dispatcher->timer, cmdbatch->expires);
+
+ /* Start the fault detection timer */
+ if (adreno_dev->fast_hang_detect)
+ mod_timer(&dispatcher->fault_timer,
+ jiffies +
+ msecs_to_jiffies(_fault_timer_interval));
}
return 0;
@@ -343,130 +422,6 @@
}
/**
- * adreno_dispatcher_replay() - Replay commands from the dispatcher queue
- * @adreno_dev: Pointer to the adreno device struct
- *
- * Replay the commands from the dispatcher inflight queue. This is called after
- * a power down/up to recover from a fault
- */
-int adreno_dispatcher_replay(struct adreno_device *adreno_dev)
-{
- struct kgsl_device *device = &adreno_dev->dev;
- struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
- struct kgsl_cmdbatch **replay;
- int i, ptr, count = 0;
-
- BUG_ON(!mutex_is_locked(&dispatcher->mutex));
-
- replay = kzalloc(sizeof(*replay) * dispatcher->inflight, GFP_KERNEL);
-
- /*
- * If we can't allocate enough memory for the replay commands then we
- * are in a bad way. Invalidate everything, reset the GPU and see ya
- * later alligator
- */
-
- if (replay == NULL) {
-
- ptr = dispatcher->head;
-
- while (ptr != dispatcher->tail) {
- struct kgsl_context *context =
- dispatcher->cmdqueue[ptr]->context;
-
- adreno_drawctxt_invalidate(device, context);
- ptr = CMDQUEUE_NEXT(ptr, ADRENO_DISPATCH_CMDQUEUE_SIZE);
- }
-
- /* Reset the dispatcher queue */
- dispatcher->inflight = 0;
- dispatcher->head = dispatcher->tail = 0;
-
- /* Reset the hardware */
- mutex_lock(&device->mutex);
-
- /*
- * If adreno_reset fails then the GPU is not alive and there
- * isn't anything we can do to recover at this point
- */
-
- BUG_ON(adreno_reset(device));
- mutex_unlock(&device->mutex);
-
- return 0;
- }
-
- ptr = dispatcher->head;
-
- while (ptr != dispatcher->tail) {
- struct kgsl_cmdbatch *cmdbatch = dispatcher->cmdqueue[ptr];
- struct adreno_context *drawctxt =
- ADRENO_CONTEXT(cmdbatch->context);
-
- if (cmdbatch->invalid)
- adreno_drawctxt_invalidate(device, cmdbatch->context);
-
- if (!kgsl_context_detached(cmdbatch->context) &&
- drawctxt->state == ADRENO_CONTEXT_STATE_ACTIVE) {
- /*
- * The context for the command batch is still valid -
- * add it to the replay list
- */
- replay[count++] = dispatcher->cmdqueue[ptr];
- } else {
- /*
- * Skip over invaliated or detached contexts - cancel
- * any pending events for the timestamp and destroy the
- * command batch
- */
- mutex_lock(&device->mutex);
- kgsl_cancel_events_timestamp(device, cmdbatch->context,
- cmdbatch->timestamp);
- mutex_unlock(&device->mutex);
-
- kgsl_cmdbatch_destroy(cmdbatch);
- }
-
- ptr = CMDQUEUE_NEXT(ptr, ADRENO_DISPATCH_CMDQUEUE_SIZE);
- }
-
- /* Reset the dispatcher queue */
- dispatcher->inflight = 0;
- dispatcher->head = dispatcher->tail = 0;
-
- mutex_lock(&device->mutex);
- BUG_ON(adreno_reset(device));
- mutex_unlock(&device->mutex);
-
- /* Replay the pending command buffers */
- for (i = 0; i < count; i++) {
- int ret = sendcmd(adreno_dev, replay[i]);
-
- /*
- * I'm afraid that if we get an error during replay we
- * are not going to space today
- */
-
- BUG_ON(ret);
- }
-
- /*
- * active_count will be set when we come into this function because
- * there were inflight commands. By virtue of setting ->inflight back
- * to 0 sendcmd() will increase the active count again on the first
- * submission. This active_count_put is needed to put the universe back
- * in balance and as a bonus it ensures that the hardware stays up for
- * the entire reset process
- */
- mutex_lock(&device->mutex);
- kgsl_active_count_put(device);
- mutex_unlock(&device->mutex);
-
- kfree(replay);
- return 0;
-}
-
-/**
* adreno_dispatcher_queue_cmd() - Queue a new command in the context
* @adreno_dev: Pointer to the adreno device struct
* @drawctxt: Pointer to the adreno draw context
@@ -489,6 +444,42 @@
return -EINVAL;
}
+ /*
+ * After skipping to the end of the frame we need to force the preamble
+ * to run (if it exists) regardless of the context state.
+ */
+
+ if (drawctxt->flags & CTXT_FLAGS_FORCE_PREAMBLE) {
+ set_bit(CMDBATCH_FLAG_FORCE_PREAMBLE, &cmdbatch->priv);
+ drawctxt->flags &= ~CTXT_FLAGS_FORCE_PREAMBLE;
+ }
+
+ /*
+ * If we are waiting for the end of frame and it hasn't appeared yet,
+ * then mark the command batch as skipped. It will still progress
+ * through the pipeline but it won't actually send any commands
+ */
+
+ if (drawctxt->flags & CTXT_FLAGS_SKIP_EOF) {
+ set_bit(CMDBATCH_FLAG_SKIP, &cmdbatch->priv);
+
+ /*
+ * If this command batch represents the EOF then clear the way
+ * for the dispatcher to continue submitting
+ */
+
+ if (cmdbatch->flags & KGSL_CONTEXT_END_OF_FRAME) {
+ drawctxt->flags &= ~CTXT_FLAGS_SKIP_EOF;
+
+ /*
+ * Force the preamble on the next command to ensure that
+ * the state is correct
+ */
+
+ drawctxt->flags |= CTXT_FLAGS_FORCE_PREAMBLE;
+ }
+ }
+
/* Wait for room in the context queue */
while (drawctxt->queued >= _context_cmdqueue_size) {
@@ -536,6 +527,16 @@
cmdbatch->timestamp = drawctxt->timestamp;
*timestamp = drawctxt->timestamp;
+ /*
+ * Set the fault tolerance policy for the command batch - assuming the
+ * context hasn't disabled FT, use the current device policy
+ */
+
+ if (drawctxt->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE)
+ set_bit(KGSL_FT_DISABLE, &cmdbatch->fault_policy);
+ else
+ cmdbatch->fault_policy = adreno_dev->ft_policy;
+
/* Put the command into the queue */
drawctxt->cmdqueue[drawctxt->cmdqueue_tail] = cmdbatch;
drawctxt->cmdqueue_tail = (drawctxt->cmdqueue_tail + 1) %
@@ -565,58 +566,442 @@
return 0;
}
-/**
- * dispatcher_do_fault() - Handle a GPU fault and reset the GPU
- * @device: Pointer to the KGSL device
- * @cmdbatch: Pointer to the command batch believed to be responsible for the
- * fault
- * @invalidate: Non zero if the current command should be invalidated
- *
- * Trigger a fault in the dispatcher and start the replay process
+/*
+ * If an IB inside the command batch has a gpuaddr that matches the base
+ * passed in, zero its size, which effectively skips it when it is submitted
+ * in the ringbuffer.
*/
-static void dispatcher_do_fault(struct kgsl_device *device,
- struct kgsl_cmdbatch *cmdbatch, int invalidate)
+static void cmdbatch_skip_ib(struct kgsl_cmdbatch *cmdbatch, unsigned int base)
+{
+ int i;
+
+ for (i = 0; i < cmdbatch->ibcount; i++) {
+ if (cmdbatch->ibdesc[i].gpuaddr == base) {
+ cmdbatch->ibdesc[i].sizedwords = 0;
+ return;
+ }
+ }
+}
+
+static void cmdbatch_skip_frame(struct kgsl_cmdbatch *cmdbatch,
+ struct kgsl_cmdbatch **replay, int count)
+{
+ struct adreno_context *drawctxt = ADRENO_CONTEXT(cmdbatch->context);
+ int skip = 1;
+ int i;
+
+ for (i = 0; i < count; i++) {
+
+ /*
+ * Only operate on command batches that belong to the
+ * faulting context
+ */
+
+ if (replay[i]->context->id != cmdbatch->context->id)
+ continue;
+
+ /*
+ * Skip all the command batches in this context until
+ * the EOF flag is seen. If the EOF flag is seen then
+ * force the preamble for the next command.
+ */
+
+ if (skip) {
+ set_bit(CMDBATCH_FLAG_SKIP, &replay[i]->priv);
+
+ if (replay[i]->flags & KGSL_CONTEXT_END_OF_FRAME)
+ skip = 0;
+ } else {
+ set_bit(CMDBATCH_FLAG_FORCE_PREAMBLE, &replay[i]->priv);
+ return;
+ }
+ }
+
+ /*
+ * If the EOF flag hasn't been seen yet then set the flag in the
+ * drawctxt to keep looking for it
+ */
+
+ if (skip && drawctxt)
+ drawctxt->flags |= CTXT_FLAGS_SKIP_EOF;
+
+ /*
+ * If we did see the EOF flag then force the preamble on for the
+ * next command issued on this context
+ */
+
+ if (!skip && drawctxt)
+ drawctxt->flags |= CTXT_FLAGS_FORCE_PREAMBLE;
+}
+
+static void remove_invalidated_cmdbatches(struct kgsl_device *device,
+ struct kgsl_cmdbatch **replay, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ struct kgsl_cmdbatch *cmd = replay[i];
+ struct adreno_context *drawctxt;
+
+ if (cmd == NULL)
+ continue;
+
+ drawctxt = ADRENO_CONTEXT(cmd->context);
+
+ if (kgsl_context_detached(cmd->context) ||
+ drawctxt->state == ADRENO_CONTEXT_STATE_INVALID) {
+ replay[i] = NULL;
+
+ mutex_lock(&device->mutex);
+ kgsl_cancel_events_timestamp(device, cmd->context,
+ cmd->timestamp);
+ mutex_unlock(&device->mutex);
+
+ kgsl_cmdbatch_destroy(cmd);
+ }
+ }
+}
+
+static char _pidname[TASK_COMM_LEN];
+
+static inline const char *_kgsl_context_comm(struct kgsl_context *context)
+{
+ struct task_struct *task = NULL;
+
+ if (context)
+ task = find_task_by_vpid(context->pid);
+
+ if (task)
+ get_task_comm(_pidname, task);
+ else
+ snprintf(_pidname, TASK_COMM_LEN, "unknown");
+
+ return _pidname;
+}
+
+#define pr_fault(_d, _c, fmt, args...) \
+ dev_err((_d)->dev, "%s[%d]: " fmt, \
+ _kgsl_context_comm((_c)->context), \
+ (_c)->context->pid, ##args)
+
+
+static void adreno_fault_header(struct kgsl_device *device,
+ struct kgsl_cmdbatch *cmdbatch)
+{
+ struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+ unsigned int status, base, rptr, wptr, ib1base, ib2base, ib1sz, ib2sz;
+
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_RBBM_STATUS),
+ &status);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_RB_BASE),
+ &base);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_RB_RPTR),
+ &rptr);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_RB_WPTR),
+ &wptr);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_IB1_BASE),
+ &ib1base);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_IB1_BUFSZ),
+ &ib1sz);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_IB2_BASE),
+ &ib2base);
+ kgsl_regread(device,
+ adreno_getreg(adreno_dev, ADRENO_REG_CP_IB2_BUFSZ),
+ &ib2sz);
+
+ trace_adreno_gpu_fault(cmdbatch->context->id, cmdbatch->timestamp,
+ status, rptr, wptr, ib1base, ib1sz, ib2base, ib2sz);
+
+ pr_fault(device, cmdbatch,
+ "gpu fault ctx %d ts %d status %8.8X rb %4.4x/%4.4x ib1 %8.8x/%4.4x ib2 %8.8x/%4.4x\n",
+ cmdbatch->context->id, cmdbatch->timestamp, status,
+ rptr, wptr, ib1base, ib1sz, ib2base, ib2sz);
+}
+
+static int dispatcher_do_fault(struct kgsl_device *device)
{
struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
- unsigned int reg;
+ unsigned int ptr;
+ unsigned int reg, base;
+ struct kgsl_cmdbatch **replay = NULL;
+ struct kgsl_cmdbatch *cmdbatch;
+ int ret, i, count = 0;
+ int fault, first = 0;
+ bool pagefault = false;
+ BUG_ON(dispatcher->inflight == 0);
- /* Stop the timers */
+ fault = atomic_xchg(&dispatcher->fault, 0);
+ if (fault == 0)
+ return 0;
+
+ /* Turn off all the timers */
del_timer_sync(&dispatcher->timer);
+ del_timer_sync(&dispatcher->fault_timer);
mutex_lock(&device->mutex);
- /*
- * There is an interesting race condition here - when a command batch
- * expires and we invaliate before we recover we run the risk of having
- * the UMD clean up the context and free memory that the GPU is still
- * using. Not that it is dangerous because we are a few microseconds
- * away from resetting, but it still ends up in pagefaults and log
- * messages and so on and so forth. To avoid this we mark the command
- * batch itself as invalid and then reset - the context will get
- * invalidated in the replay.
- */
+ cmdbatch = dispatcher->cmdqueue[dispatcher->head];
- if (invalidate)
- cmdbatch->invalid = 1;
+ trace_adreno_cmdbatch_fault(cmdbatch, fault);
/*
- * Stop the CP in its tracks - this ensures that we don't get activity
- * while we are trying to dump the state of the system
+ * If the fault was due to a timeout then stop the CP to ensure we don't
+ * get activity while we are trying to dump the state of the system
*/
+ if (fault == ADRENO_TIMEOUT_FAULT) {
+ adreno_readreg(adreno_dev, ADRENO_REG_CP_ME_CNTL, ®);
+ reg |= (1 << 27) | (1 << 28);
+ adreno_writereg(adreno_dev, ADRENO_REG_CP_ME_CNTL, reg);
- adreno_readreg(adreno_dev, ADRENO_REG_CP_ME_CNTL, ®);
- reg |= (1 << 27) | (1 << 28);
- adreno_writereg(adreno_dev, ADRENO_REG_CP_ME_CNTL, reg);
+ /* Skip the PM dump for a timeout because it confuses people */
+ set_bit(KGSL_FT_SKIP_PMDUMP, &cmdbatch->fault_policy);
+ }
- kgsl_postmortem_dump(device, 0);
- kgsl_device_snapshot(device, 1);
+ adreno_readreg(adreno_dev, ADRENO_REG_CP_IB1_BASE, &base);
+
+ /*
+ * Dump the postmortem and snapshot information if this is the first
+ * detected fault for the oldest active command batch
+ */
+
+ if (!test_bit(KGSL_FT_SKIP_PMDUMP, &cmdbatch->fault_policy)) {
+ adreno_fault_header(device, cmdbatch);
+
+ if (device->pm_dump_enable)
+ kgsl_postmortem_dump(device, 0);
+
+ kgsl_device_snapshot(device, 1);
+ }
+
mutex_unlock(&device->mutex);
- /* If we can't replay then bravely run away and die */
- if (adreno_dispatcher_replay(adreno_dev))
- BUG();
+ /* Allocate memory to store the inflight commands */
+ replay = kzalloc(sizeof(*replay) * dispatcher->inflight, GFP_KERNEL);
+
+ if (replay == NULL) {
+ unsigned int ptr = dispatcher->head;
+
+ while (ptr != dispatcher->tail) {
+ struct kgsl_context *context =
+ dispatcher->cmdqueue[ptr]->context;
+
+ adreno_drawctxt_invalidate(device, context);
+ kgsl_cmdbatch_destroy(dispatcher->cmdqueue[ptr]);
+
+ ptr = CMDQUEUE_NEXT(ptr, ADRENO_DISPATCH_CMDQUEUE_SIZE);
+ }
+
+ /*
+ * Set the replay count to zero - this will ensure that the
+ * hardware gets reset but nothing else gets replayed
+ */
+
+ count = 0;
+ goto replay;
+ }
+
+ /* Copy the inflight command batches into the temporary storage */
+ ptr = dispatcher->head;
+
+ while (ptr != dispatcher->tail) {
+ replay[count++] = dispatcher->cmdqueue[ptr];
+ ptr = CMDQUEUE_NEXT(ptr, ADRENO_DISPATCH_CMDQUEUE_SIZE);
+ }
+
+ /*
+ * For the purposes of replay, we assume that the oldest command batch
+ * that hasn't retired a timestamp is "hung".
+ */
+
+ cmdbatch = replay[0];
+
+ /*
+ * If FT is disabled for this cmdbatch, invalidate it immediately
+ */
+
+ if (test_bit(KGSL_FT_DISABLE, &cmdbatch->fault_policy) ||
+ test_bit(KGSL_FT_TEMP_DISABLE, &cmdbatch->fault_policy)) {
+ pr_fault(device, cmdbatch, "gpu skipped ctx %d ts %d\n",
+ cmdbatch->context->id, cmdbatch->timestamp);
+
+ adreno_drawctxt_invalidate(device, cmdbatch->context);
+ }
+
+ /*
+ * Set a flag so we don't print another PM dump if the cmdbatch fails
+ * again on replay
+ */
+
+ set_bit(KGSL_FT_SKIP_PMDUMP, &cmdbatch->fault_policy);
+
+ /*
+ * A hardware fault generally means something was deterministically
+ * wrong with the command batch - no point in trying to replay it.
+ * Clear the replay bit and move on to the next policy level
+ */
+
+ if (fault == ADRENO_HARD_FAULT)
+ clear_bit(KGSL_FT_REPLAY, &(cmdbatch->fault_policy));
+
+ /*
+ * A timeout fault means the IB timed out - clear the policy and
+ * invalidate - this will clear the FT_SKIP_PMDUMP bit but that is okay
+ * because we won't see this cmdbatch again
+ */
+
+ if (fault == ADRENO_TIMEOUT_FAULT)
+ bitmap_zero(&cmdbatch->fault_policy, BITS_PER_LONG);
+
+ /*
+ * If the context had a GPU page fault then it is likely it would fault
+ * again if replayed
+ */
+
+ if (test_bit(KGSL_CONTEXT_PAGEFAULT, &cmdbatch->context->priv)) {
+ /* we'll need to resume the mmu later... */
+ pagefault = true;
+ clear_bit(KGSL_FT_REPLAY, &cmdbatch->fault_policy);
+ clear_bit(KGSL_CONTEXT_PAGEFAULT, &cmdbatch->context->priv);
+ }
+
+ /*
+ * Execute the fault tolerance policy. Each command batch stores the
+ * current fault policy that was set when it was queued.
+ * As the options are tried in descending priority
+ * (REPLAY -> SKIPIBS -> SKIPFRAME -> NOTHING) the bits are cleared
+ * from the cmdbatch policy so the next option can be tried if the
+ * fault comes around again
+ */
+
+ /* Replay the hanging command batch again */
+ if (test_and_clear_bit(KGSL_FT_REPLAY, &cmdbatch->fault_policy)) {
+ trace_adreno_cmdbatch_recovery(cmdbatch, BIT(KGSL_FT_REPLAY));
+ set_bit(KGSL_FT_REPLAY, &cmdbatch->fault_recovery);
+ goto replay;
+ }
+
+ /*
+ * Skip the last IB1 that was played but replay everything else.
+ * Note that the last IB1 might not be in the "hung" command batch
+ * because the CP may have caused a page-fault while it was prefetching
+ * the next IB1/IB2. Walk all outstanding commands and zap the
+ * supposedly bad IB1 wherever it lurks.
+ */
+
+ if (test_and_clear_bit(KGSL_FT_SKIPIB, &cmdbatch->fault_policy)) {
+ trace_adreno_cmdbatch_recovery(cmdbatch, BIT(KGSL_FT_SKIPIB));
+ set_bit(KGSL_FT_SKIPIB, &cmdbatch->fault_recovery);
+
+ for (i = 0; i < count; i++) {
+ if (replay[i] != NULL)
+ cmdbatch_skip_ib(replay[i], base);
+ }
+
+ goto replay;
+ }
+
+ if (test_and_clear_bit(KGSL_FT_SKIPFRAME, &cmdbatch->fault_policy)) {
+ trace_adreno_cmdbatch_recovery(cmdbatch,
+ BIT(KGSL_FT_SKIPFRAME));
+ set_bit(KGSL_FT_SKIPFRAME, &cmdbatch->fault_recovery);
+
+ /*
+ * Skip all the pending command batches for this context until
+ * the EOF frame is seen
+ */
+ cmdbatch_skip_frame(cmdbatch, replay, count);
+ goto replay;
+ }
+
+ /* If we get here then all the policies failed */
+
+ pr_fault(device, cmdbatch, "gpu failed ctx %d ts %d\n",
+ cmdbatch->context->id, cmdbatch->timestamp);
+
+ /* Invalidate the context */
+ adreno_drawctxt_invalidate(device, cmdbatch->context);
+
+
+replay:
+ /* Reset the dispatcher queue */
+ dispatcher->inflight = 0;
+ dispatcher->head = dispatcher->tail = 0;
+
+ /* Reset the GPU */
+ mutex_lock(&device->mutex);
+
+ /* resume the MMU if it is stalled */
+ if (pagefault && device->mmu.mmu_ops->mmu_pagefault_resume != NULL)
+ device->mmu.mmu_ops->mmu_pagefault_resume(&device->mmu);
+
+ ret = adreno_reset(device);
+ mutex_unlock(&device->mutex);
+
+ /* If adreno_reset() fails then what hope do we have for the future? */
+ BUG_ON(ret);
+
+ /* Remove any pending command batches that have been invalidated */
+ remove_invalidated_cmdbatches(device, replay, count);
+
+ /* Replay the pending command buffers */
+ for (i = 0; i < count; i++) {
+
+ int ret;
+
+ if (replay[i] == NULL)
+ continue;
+
+ /*
+ * Force the preamble on the first command (if applicable) to
+ * avoid any strange stage issues
+ */
+
+ if (first == 0) {
+ set_bit(CMDBATCH_FLAG_FORCE_PREAMBLE, &replay[i]->priv);
+ first = 1;
+ }
+
+ /*
+ * Force each command batch to wait for idle - this avoids weird
+ * CP parse issues
+ */
+
+ set_bit(CMDBATCH_FLAG_WFI, &replay[i]->priv);
+
+ ret = sendcmd(adreno_dev, replay[i]);
+
+ /*
+ * If sending the command fails, then try to recover by
+ * invalidating the context
+ */
+
+ if (ret) {
+ pr_fault(device, replay[i],
+ "gpu reset failed ctx %d ts %d\n",
+ replay[i]->context->id, replay[i]->timestamp);
+
+ adreno_drawctxt_invalidate(device, replay[i]->context);
+ remove_invalidated_cmdbatches(device, &replay[i],
+ count - i);
+ }
+ }
+
+ mutex_lock(&device->mutex);
+ kgsl_active_count_put(device);
+ mutex_unlock(&device->mutex);
+
+ kfree(replay);
+
+ return 1;
}
static inline int cmdbatch_consumed(struct kgsl_cmdbatch *cmdbatch,
@@ -626,6 +1011,30 @@
(timestamp_cmp(retired, cmdbatch->timestamp) < 0));
}
+static void _print_recovery(struct kgsl_device *device,
+ struct kgsl_cmdbatch *cmdbatch)
+{
+ static struct {
+ unsigned int mask;
+ const char *str;
+ } flags[] = { ADRENO_FT_TYPES };
+
+ int i, nr = find_first_bit(&cmdbatch->fault_recovery, BITS_PER_LONG);
+ char *result = "unknown";
+
+ for (i = 0; i < ARRAY_SIZE(flags); i++) {
+ if (flags[i].mask == BIT(nr)) {
+ result = (char *) flags[i].str;
+ break;
+ }
+ }
+
+ pr_fault(device, cmdbatch,
+ "gpu %s ctx %d ts %d policy %lX\n",
+ result, cmdbatch->context->id, cmdbatch->timestamp,
+ cmdbatch->fault_recovery);
+}
+
/**
* adreno_dispatcher_work() - Master work handler for the dispatcher
* @work: Pointer to the work struct for the current work queue
@@ -639,7 +1048,7 @@
struct adreno_device *adreno_dev =
container_of(dispatcher, struct adreno_device, dispatcher);
struct kgsl_device *device = &adreno_dev->dev;
- int inv, count = 0;
+ int count = 0;
mutex_lock(&dispatcher->mutex);
@@ -667,6 +1076,14 @@
if (kgsl_context_detached(cmdbatch->context) ||
(timestamp_cmp(cmdbatch->timestamp, retired) <= 0)) {
+ /*
+ * If the cmdbatch in question had faulted, announce its
+ * successful completion to the world
+ */
+
+ if (cmdbatch->fault_recovery != 0)
+ _print_recovery(device, cmdbatch);
+
trace_adreno_cmdbatch_retired(cmdbatch,
dispatcher->inflight - 1);
@@ -693,8 +1110,6 @@
}
count++;
-
- BUG_ON(dispatcher->inflight == 0 && dispatcher->fault);
continue;
}
@@ -703,17 +1118,23 @@
* is to blame. Invalidate it, reset and replay
*/
- if (dispatcher->fault) {
- dispatcher_do_fault(device, cmdbatch, 1);
+ if (dispatcher_do_fault(device))
goto done;
- }
/* Get the last consumed timestamp */
consumed = kgsl_readtimestamp(device, cmdbatch->context,
KGSL_TIMESTAMP_CONSUMED);
- /* Break here if fault detection is disabled for the context */
- if (drawctxt->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE)
+ /*
+ * Break here if fault detection is disabled for the context or
+ * if the long running IB detection is disabled device wide.
+ * Long running command buffers will be allowed to run to
+ * completion - but badly behaving command buffers (infinite
+ * shaders etc) can end up running forever.
+ */
+
+ if (!adreno_dev->long_ib_detect ||
+ drawctxt->flags & CTXT_FLAGS_NO_FAULT_TOLERANCE)
break;
/*
@@ -727,23 +1148,13 @@
/* Boom goes the dynamite */
- pr_err("-----------------------\n");
+ pr_fault(device, cmdbatch,
+ "gpu timeout ctx %d ts %d\n",
+ cmdbatch->context->id, cmdbatch->timestamp);
- pr_err("dispatcher: expired ctx=%d ts=%d consumed=%d retired=%d\n",
- cmdbatch->context->id, cmdbatch->timestamp, consumed,
- retired);
- pr_err("dispatcher: jiffies=%lu expired=%lu\n", jiffies,
- cmdbatch->expires);
+ adreno_set_gpu_fault(adreno_dev, ADRENO_TIMEOUT_FAULT);
- /*
- * If execution stopped after the current command batch was
- * consumed then invalidate the context for the current command
- * batch
- */
-
- inv = cmdbatch_consumed(cmdbatch, consumed, retired);
-
- dispatcher_do_fault(device, cmdbatch, inv);
+ dispatcher_do_fault(device);
break;
}
@@ -768,9 +1179,12 @@
struct kgsl_cmdbatch *cmdbatch
= dispatcher->cmdqueue[dispatcher->head];
+ /* Update the timeout timer for the next command batch */
mod_timer(&dispatcher->timer, cmdbatch->expires);
- } else
+ } else {
del_timer_sync(&dispatcher->timer);
+ del_timer_sync(&dispatcher->fault_timer);
+ }
/* Before leaving update the pwrscale information */
mutex_lock(&device->mutex);
@@ -788,6 +1202,60 @@
queue_work(device->work_queue, &dispatcher->work);
}
+/**
+ * adreno_dispatcher_queue_context() - schedule a drawctxt in the dispatcher
+ * @device: pointer to the KGSL device
+ * @drawctxt: pointer to the drawctxt to schedule
+ *
+ * Put a draw context on the dispatcher pending queue and schedule the
+ * dispatcher. This is used to reschedule changes that might have been blocked
+ * for sync points or other concerns
+ */
+void adreno_dispatcher_queue_context(struct kgsl_device *device,
+ struct adreno_context *drawctxt)
+{
+ struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+
+ dispatcher_queue_context(adreno_dev, drawctxt);
+ adreno_dispatcher_schedule(device);
+}
+
+/*
+ * This is called on a regular basis while command batches are inflight. Fault
+ * detection registers are read and compared to the existing values - if they
+ * changed then the GPU is still running. If they are the same between
+ * subsequent calls then the GPU may have faulted
+ */
+
+void adreno_dispatcher_fault_timer(unsigned long data)
+{
+ struct adreno_device *adreno_dev = (struct adreno_device *) data;
+ struct kgsl_device *device = &adreno_dev->dev;
+ struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
+
+ /* Leave if the user decided to turn off fast hang detection */
+ if (adreno_dev->fast_hang_detect == 0)
+ return;
+
+ if (adreno_gpu_fault(adreno_dev)) {
+ adreno_dispatcher_schedule(device);
+ return;
+ }
+
+ /*
+ * Read the fault registers - if the compare returns 0 then they haven't
+ * changed, so mark the dispatcher as faulted and schedule the work loop.
+ */
+
+ if (!fault_detect_read_compare(device)) {
+ adreno_set_gpu_fault(adreno_dev, ADRENO_SOFT_FAULT);
+ adreno_dispatcher_schedule(device);
+ } else {
+ mod_timer(&dispatcher->fault_timer,
+ jiffies + msecs_to_jiffies(_fault_timer_interval));
+ }
+}
+
/*
* This is called when the timer expires - it either means the GPU is hung or
* the IB is taking too long to execute
@@ -800,18 +1268,15 @@
adreno_dispatcher_schedule(device);
}
/**
- * adreno_dispatcher_fault_irq() - Trigger a fault in the dispatcher
+ * adreno_dispatcher_irq_fault() - Trigger a fault in the dispatcher
* @device: Pointer to the KGSL device
*
* Called from an interrupt context this will trigger a fault in the
- * dispatcher
+ * dispatcher for the oldest pending command batch
*/
-void adreno_dispatcher_fault_irq(struct kgsl_device *device)
+void adreno_dispatcher_irq_fault(struct kgsl_device *device)
{
- struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
- struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
-
- dispatcher->fault = 1;
+ adreno_set_gpu_fault(ADRENO_DEVICE(device), ADRENO_HARD_FAULT);
adreno_dispatcher_schedule(device);
}
@@ -863,6 +1328,8 @@
struct adreno_dispatcher *dispatcher = &adreno_dev->dispatcher;
del_timer_sync(&dispatcher->timer);
+ del_timer_sync(&dispatcher->fault_timer);
+
dispatcher->state = ADRENO_DISPATCHER_PAUSE;
}
@@ -878,6 +1345,7 @@
mutex_lock(&dispatcher->mutex);
del_timer_sync(&dispatcher->timer);
+ del_timer_sync(&dispatcher->fault_timer);
while (dispatcher->head != dispatcher->tail) {
kgsl_cmdbatch_destroy(dispatcher->cmdqueue[dispatcher->head]);
@@ -953,6 +1421,8 @@
_context_cmdbatch_burst);
static DISPATCHER_UINT_ATTR(cmdbatch_timeout, 0644, 0, _cmdbatch_timeout);
static DISPATCHER_UINT_ATTR(context_queue_wait, 0644, 0, _context_queue_wait);
+static DISPATCHER_UINT_ATTR(fault_detect_interval, 0644, 0,
+ _fault_timer_interval);
static struct attribute *dispatcher_attrs[] = {
&dispatcher_attr_inflight.attr,
@@ -960,6 +1430,7 @@
&dispatcher_attr_context_burst_count.attr,
&dispatcher_attr_cmdbatch_timeout.attr,
&dispatcher_attr_context_queue_wait.attr,
+ &dispatcher_attr_fault_detect_interval.attr,
NULL,
};
@@ -1024,6 +1495,9 @@
setup_timer(&dispatcher->timer, adreno_dispatcher_timer,
(unsigned long) adreno_dev);
+ setup_timer(&dispatcher->fault_timer, adreno_dispatcher_fault_timer,
+ (unsigned long) adreno_dev);
+
INIT_WORK(&dispatcher->work, adreno_dispatcher_work);
plist_head_init(&dispatcher->pending);
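
dispatcher_do_fault() above walks the per-cmdbatch policy bits in descending priority, clearing each option as it is tried so that a repeat fault falls through to the next one. A condensed user-space sketch of that escalation, with BIT() and the policy positions redefined locally for illustration:

#include <stdio.h>

#define BIT(n)            (1UL << (n))
#define KGSL_FT_REPLAY    1
#define KGSL_FT_SKIPIB    2
#define KGSL_FT_SKIPFRAME 3

/* Clear and report the highest-priority option left in the policy */
static const char *next_recovery(unsigned long *policy)
{
	if (*policy & BIT(KGSL_FT_REPLAY)) {
		*policy &= ~BIT(KGSL_FT_REPLAY);
		return "replay";
	}
	if (*policy & BIT(KGSL_FT_SKIPIB)) {
		*policy &= ~BIT(KGSL_FT_SKIPIB);
		return "skipib";
	}
	if (*policy & BIT(KGSL_FT_SKIPFRAME)) {
		*policy &= ~BIT(KGSL_FT_SKIPFRAME);
		return "skipframe";
	}
	return "invalidate";	/* every option exhausted */
}

int main(void)
{
	/* KGSL_FT_DEFAULT_POLICY: replay first, then skip the bad IB */
	unsigned long policy = BIT(KGSL_FT_REPLAY) + BIT(KGSL_FT_SKIPIB);

	printf("%s\n", next_recovery(&policy));	/* replay */
	printf("%s\n", next_recovery(&policy));	/* skipib */
	printf("%s\n", next_recovery(&policy));	/* invalidate */
	return 0;
}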
diff --git a/drivers/gpu/msm/adreno_drawctxt.h b/drivers/gpu/msm/adreno_drawctxt.h
index f8469e2..486c70a 100644
--- a/drivers/gpu/msm/adreno_drawctxt.h
+++ b/drivers/gpu/msm/adreno_drawctxt.h
@@ -54,6 +54,8 @@
#define CTXT_FLAGS_SKIP_EOF BIT(15)
/* Context no fault tolerance */
#define CTXT_FLAGS_NO_FAULT_TOLERANCE BIT(16)
+/* Force the preamble for the next submission */
+#define CTXT_FLAGS_FORCE_PREAMBLE BIT(17)
/* Symbolic table for the adreno draw context type */
#define ADRENO_DRAWCTXT_TYPES \
diff --git a/drivers/gpu/msm/adreno_postmortem.c b/drivers/gpu/msm/adreno_postmortem.c
index 294ae76..8fb2830 100644
--- a/drivers/gpu/msm/adreno_postmortem.c
+++ b/drivers/gpu/msm/adreno_postmortem.c
@@ -396,8 +396,8 @@
int adreno_dump(struct kgsl_device *device, int manual)
{
- unsigned int cp_ib1_base, cp_ib1_bufsz;
- unsigned int cp_ib2_base, cp_ib2_bufsz;
+ unsigned int cp_ib1_base;
+ unsigned int cp_ib2_base;
phys_addr_t pt_base, cur_pt_base;
unsigned int cp_rb_base, cp_rb_ctrl, rb_count;
unsigned int cp_rb_wptr, cp_rb_rptr;
@@ -410,7 +410,6 @@
unsigned int ts_processed = 0xdeaddead;
struct kgsl_context *context;
unsigned int context_id;
- unsigned int rbbm_status;
static struct ib_list ib_list;
@@ -420,16 +419,10 @@
mb();
- if (device->pm_dump_enable) {
- msm_clk_dump_debug_info();
+ msm_clk_dump_debug_info();
- if (adreno_dev->gpudev->postmortem_dump)
- adreno_dev->gpudev->postmortem_dump(adreno_dev);
- }
-
- kgsl_regread(device,
- adreno_getreg(adreno_dev, ADRENO_REG_RBBM_STATUS),
- &rbbm_status);
+ if (adreno_dev->gpudev->postmortem_dump)
+ adreno_dev->gpudev->postmortem_dump(adreno_dev);
pt_base = kgsl_mmu_get_current_ptbase(&device->mmu);
cur_pt_base = pt_base;
@@ -451,29 +444,8 @@
adreno_getreg(adreno_dev, ADRENO_REG_CP_IB1_BASE),
&cp_ib1_base);
kgsl_regread(device,
- adreno_getreg(adreno_dev, ADRENO_REG_CP_IB1_BUFSZ),
- &cp_ib1_bufsz);
- kgsl_regread(device,
adreno_getreg(adreno_dev, ADRENO_REG_CP_IB2_BASE),
&cp_ib2_base);
- kgsl_regread(device,
- adreno_getreg(adreno_dev, ADRENO_REG_CP_IB2_BUFSZ),
- &cp_ib2_bufsz);
-
- trace_adreno_gpu_fault(rbbm_status, cp_rb_rptr, cp_rb_wptr,
- cp_ib1_base, cp_ib1_bufsz, cp_ib2_base, cp_ib2_bufsz);
-
- /* If postmortem dump is not enabled, dump minimal set and return */
- if (!device->pm_dump_enable) {
-
- KGSL_LOG_DUMP(device,
- "STATUS %08X | IB1:%08X/%08X | IB2: %08X/%08X"
- " | RPTR: %04X | WPTR: %04X\n",
- rbbm_status, cp_ib1_base, cp_ib1_bufsz, cp_ib2_base,
- cp_ib2_bufsz, cp_rb_rptr, cp_rb_wptr);
-
- return 0;
- }
kgsl_sharedmem_readl(&device->memstore,
(unsigned int *) &context_id,
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index dc1530a..1ad90fd 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -606,8 +606,8 @@
if (adreno_is_a20x(adreno_dev))
total_sizedwords += 2; /* CACHE_FLUSH */
- if (flags & KGSL_CMD_FLAGS_EOF)
- total_sizedwords += 2;
+ if (flags & KGSL_CMD_FLAGS_WFI)
+ total_sizedwords += 2; /* WFI */
if (profile_ready)
total_sizedwords += 6; /* space for pre_ib and post_ib */
@@ -737,6 +737,12 @@
GSL_RB_WRITE(rb->device, ringcmds, rcmd_gpu, 0);
}
+ if (flags & KGSL_CMD_FLAGS_WFI) {
+ GSL_RB_WRITE(rb->device, ringcmds, rcmd_gpu,
+ cp_type3_packet(CP_WAIT_FOR_IDLE, 1));
+ GSL_RB_WRITE(rb->device, ringcmds, rcmd_gpu, 0x00000000);
+ }
+
adreno_ringbuffer_submit(rb);
return 0;
@@ -1024,6 +1030,7 @@
struct kgsl_context *context;
struct adreno_context *drawctxt;
unsigned int start_index = 0;
+ int flags = KGSL_CMD_FLAGS_NONE;
int ret;
context = cmdbatch->context;
@@ -1039,10 +1046,23 @@
commands are stored in the first node of the IB chain. We can skip that
if a context switch hasn't occurred */
- if (drawctxt->flags & CTXT_FLAGS_PREAMBLE &&
- adreno_dev->drawctxt_active == drawctxt)
+ if ((drawctxt->flags & CTXT_FLAGS_PREAMBLE) &&
+ !test_bit(CMDBATCH_FLAG_FORCE_PREAMBLE, &cmdbatch->priv) &&
+ (adreno_dev->drawctxt_active == drawctxt))
start_index = 1;
+ /*
+ * In skip mode don't issue the draw IBs but keep all the other
+ * accoutrements of a submision (including the interrupt) to keep
+ * the accounting sane. Set start_index and numibs to 0 to just
+ * generate the start and end markers and skip everything else
+ */
+
+ if (test_bit(CMDBATCH_FLAG_SKIP, &cmdbatch->priv)) {
+ start_index = 0;
+ numibs = 0;
+ }
+
cmds = link = kzalloc(sizeof(unsigned int) * (numibs * 3 + 4),
GFP_KERNEL);
if (!link) {
@@ -1061,7 +1081,17 @@
*cmds++ = ibdesc[0].sizedwords;
}
for (i = start_index; i < numibs; i++) {
- *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
+
+ /*
+ * Skip 0 sized IBs - these are presumed to have been removed
+ * from consideration by the FT policy
+ */
+
+ if (ibdesc[i].sizedwords == 0)
+ *cmds++ = cp_nop_packet(2);
+ else
+ *cmds++ = CP_HDR_INDIRECT_BUFFER_PFD;
+
*cmds++ = ibdesc[i].gpuaddr;
*cmds++ = ibdesc[i].sizedwords;
}
@@ -1085,9 +1115,12 @@
if (ret)
goto done;
+ if (test_bit(CMDBATCH_FLAG_WFI, &cmdbatch->priv))
+ flags = KGSL_CMD_FLAGS_WFI;
+
ret = adreno_ringbuffer_addcmds(&adreno_dev->ringbuffer,
drawctxt,
- cmdbatch->flags,
+ flags,
&link[0], (cmds - link),
cmdbatch->timestamp);
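
Because cmdbatch_skip_ib() only zeroes sizedwords, a zapped IB still occupies its three-dword slot in the command list; the packing loop in adreno_ringbuffer_submitcmd() above simply swaps the indirect-buffer header for a two-dword NOP so the CP ignores the gpuaddr/size pair that follows. A toy sketch of that packing decision (the packet encodings are made-up placeholders, not the real CP opcodes):

#include <stdio.h>

struct ibdesc { unsigned int gpuaddr, sizedwords; };

/* Made-up stand-ins for cp_nop_packet(2) and CP_HDR_INDIRECT_BUFFER_PFD */
#define FAKE_NOP2_HDR 0xC0001000u
#define FAKE_IB_HDR   0xC0023700u

static unsigned int *pack_ib(unsigned int *cmds, const struct ibdesc *ib)
{
	/* Zero-sized IBs were zapped by the FT policy: emit a NOP header
	 * so the CP skips the next two dwords (gpuaddr, size) */
	*cmds++ = ib->sizedwords ? FAKE_IB_HDR : FAKE_NOP2_HDR;
	*cmds++ = ib->gpuaddr;
	*cmds++ = ib->sizedwords;
	return cmds;
}

int main(void)
{
	unsigned int buf[6], *end = buf;
	struct ibdesc good = { 0x1000, 64 }, zapped = { 0x2000, 0 };

	end = pack_ib(end, &good);
	end = pack_ib(end, &zapped);

	for (unsigned int *p = buf; p < end; p++)
		printf("%08X\n", *p);
	return 0;
}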
diff --git a/drivers/gpu/msm/adreno_trace.h b/drivers/gpu/msm/adreno_trace.h
index 59aca2e..01194f4 100644
--- a/drivers/gpu/msm/adreno_trace.h
+++ b/drivers/gpu/msm/adreno_trace.h
@@ -61,14 +61,78 @@
)
);
-DEFINE_EVENT(adreno_cmdbatch_template, adreno_cmdbatch_retired,
+DEFINE_EVENT(adreno_cmdbatch_template, adreno_cmdbatch_submitted,
TP_PROTO(struct kgsl_cmdbatch *cmdbatch, int inflight),
TP_ARGS(cmdbatch, inflight)
);
-DEFINE_EVENT(adreno_cmdbatch_template, adreno_cmdbatch_submitted,
+TRACE_EVENT(adreno_cmdbatch_retired,
TP_PROTO(struct kgsl_cmdbatch *cmdbatch, int inflight),
- TP_ARGS(cmdbatch, inflight)
+ TP_ARGS(cmdbatch, inflight),
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ __field(unsigned int, timestamp)
+ __field(unsigned int, inflight)
+ __field(unsigned int, recovery)
+ ),
+ TP_fast_assign(
+ __entry->id = cmdbatch->context->id;
+ __entry->timestamp = cmdbatch->timestamp;
+ __entry->inflight = inflight;
+ __entry->recovery = cmdbatch->fault_recovery;
+ ),
+ TP_printk(
+ "ctx=%u ts=%u inflight=%u recovery=%s",
+ __entry->id, __entry->timestamp,
+ __entry->inflight,
+ __entry->recovery ?
+ __print_flags(__entry->recovery, "|",
+ ADRENO_FT_TYPES) : "none"
+ )
+);
+
+TRACE_EVENT(adreno_cmdbatch_fault,
+ TP_PROTO(struct kgsl_cmdbatch *cmdbatch, unsigned int fault),
+ TP_ARGS(cmdbatch, fault),
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ __field(unsigned int, timestamp)
+ __field(unsigned int, fault)
+ ),
+ TP_fast_assign(
+ __entry->id = cmdbatch->context->id;
+ __entry->timestamp = cmdbatch->timestamp;
+ __entry->fault = fault;
+ ),
+ TP_printk(
+ "ctx=%u ts=%u type=%s",
+ __entry->id, __entry->timestamp,
+ __print_symbolic(__entry->fault,
+ { 0, "none" },
+ { ADRENO_SOFT_FAULT, "soft" },
+ { ADRENO_HARD_FAULT, "hard" },
+ { ADRENO_TIMEOUT_FAULT, "timeout" })
+ )
+);
+
+TRACE_EVENT(adreno_cmdbatch_recovery,
+ TP_PROTO(struct kgsl_cmdbatch *cmdbatch, unsigned int action),
+ TP_ARGS(cmdbatch, action),
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ __field(unsigned int, timestamp)
+ __field(unsigned int, action)
+ ),
+ TP_fast_assign(
+ __entry->id = cmdbatch->context->id;
+ __entry->timestamp = cmdbatch->timestamp;
+ __entry->action = action;
+ ),
+ TP_printk(
+ "ctx=%u ts=%u action=%s",
+ __entry->id, __entry->timestamp,
+ __print_symbolic(__entry->action, ADRENO_FT_TYPES)
+ )
);
DECLARE_EVENT_CLASS(adreno_drawctxt_template,
@@ -140,11 +204,15 @@
);
TRACE_EVENT(adreno_gpu_fault,
- TP_PROTO(unsigned int status, unsigned int rptr, unsigned int wptr,
+ TP_PROTO(unsigned int ctx, unsigned int ts,
+ unsigned int status, unsigned int rptr, unsigned int wptr,
unsigned int ib1base, unsigned int ib1size,
unsigned int ib2base, unsigned int ib2size),
- TP_ARGS(status, rptr, wptr, ib1base, ib1size, ib2base, ib2size),
+ TP_ARGS(ctx, ts, status, rptr, wptr, ib1base, ib1size, ib2base,
+ ib2size),
TP_STRUCT__entry(
+ __field(unsigned int, ctx)
+ __field(unsigned int, ts)
__field(unsigned int, status)
__field(unsigned int, rptr)
__field(unsigned int, wptr)
@@ -154,6 +222,8 @@
__field(unsigned int, ib2size)
),
TP_fast_assign(
+ __entry->ctx = ctx;
+ __entry->ts = ts;
__entry->status = status;
__entry->rptr = rptr;
__entry->wptr = wptr;
@@ -162,10 +232,10 @@
__entry->ib2base = ib2base;
__entry->ib2size = ib2size;
),
- TP_printk("status=%X RB=%X/%X IB1=%X/%X IB2=%X/%X",
- __entry->status, __entry->wptr, __entry->rptr,
- __entry->ib1base, __entry->ib1size, __entry->ib2base,
- __entry->ib2size)
+ TP_printk("ctx=%d ts=%d status=%X RB=%X/%X IB1=%X/%X IB2=%X/%X",
+ __entry->ctx, __entry->ts, __entry->status, __entry->wptr,
+ __entry->rptr, __entry->ib1base, __entry->ib1size,
+ __entry->ib2base, __entry->ib2size)
);
#endif /* _ADRENO_TRACE_H */
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index f5b27d0..f1a97ed 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -160,8 +160,12 @@
/**
* struct kgsl_cmdbatch - KGSl command descriptor
* @context: KGSL context that created the command
- * @timestamp: Timestamp assigned to the command (currently unused)
+ * @timestamp: Timestamp assigned to the command
* @flags: flags
+ * @priv: Internal flags
+ * @fault_policy: Internal policy describing how to handle this command in case
+ * of a fault
+ * @fault_recovery: recovery actions actually tried for this batch
* @ibcount: Number of IBs in the command list
* @ibdesc: Pointer to the list of IBs
* @expires: Point in time when the cmdbatch is considered to be hung
@@ -172,12 +176,28 @@
struct kgsl_context *context;
uint32_t timestamp;
uint32_t flags;
+ unsigned long priv;
+ unsigned long fault_policy;
+ unsigned long fault_recovery;
uint32_t ibcount;
struct kgsl_ibdesc *ibdesc;
unsigned long expires;
int invalid;
};
+/**
+ * enum kgsl_cmdbatch_priv - Internal cmdbatch flags
+ * @CMDBATCH_FLAG_SKIP - skip the entire command batch
+ * @CMDBATCH_FLAG_FORCE_PREAMBLE - Force the preamble on for the cmdbatch
+ * @CMDBATCH_FLAG_WFI - Force wait-for-idle for the submission
+ */
+
+enum kgsl_cmdbatch_priv {
+ CMDBATCH_FLAG_SKIP = 0,
+ CMDBATCH_FLAG_FORCE_PREAMBLE,
+ CMDBATCH_FLAG_WFI,
+};
+
struct kgsl_device {
struct device *dev;
const char *name;
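
The new priv word is a bitmap indexed by the enum positions above and is manipulated with the set_bit()/test_bit() family rather than OR-ed mask constants. A small sketch of the same idiom in portable C, with minimal stand-ins for the kernel bitops on a single unsigned long:

#include <stdio.h>

enum cmdbatch_priv { FLAG_SKIP = 0, FLAG_FORCE_PREAMBLE, FLAG_WFI };

/* Simplified stand-ins for the kernel's set_bit()/test_bit() */
static void set_bit_ul(int nr, unsigned long *word)
{
	*word |= 1UL << nr;
}

static int test_bit_ul(int nr, const unsigned long *word)
{
	return (*word >> nr) & 1UL;
}

int main(void)
{
	unsigned long priv = 0;	/* stands in for cmdbatch->priv */

	set_bit_ul(FLAG_SKIP, &priv);
	set_bit_ul(FLAG_WFI, &priv);

	printf("skip=%d preamble=%d wfi=%d\n",
	       test_bit_ul(FLAG_SKIP, &priv),
	       test_bit_ul(FLAG_FORCE_PREAMBLE, &priv),
	       test_bit_ul(FLAG_WFI, &priv));	/* skip=1 preamble=0 wfi=1 */
	return 0;
}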
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index 103736d..e296784 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -423,8 +423,12 @@
* the GPU and trigger a snapshot. To stall the transaction return
* EBUSY error.
*/
- if (adreno_dev->ft_pf_policy & KGSL_FT_PAGEFAULT_GPUHALT_ENABLE)
+ if (adreno_dev->ft_pf_policy & KGSL_FT_PAGEFAULT_GPUHALT_ENABLE) {
+ /* turn off GPU IRQ so we don't get faults from it too */
+ kgsl_pwrctrl_irq(device, KGSL_PWRFLAGS_OFF);
+ adreno_dispatcher_irq_fault(device);
ret = -EBUSY;
+ }
done:
return ret;
}
diff --git a/include/linux/msm_kgsl.h b/include/linux/msm_kgsl.h
index 87047d2..1eb346b 100644
--- a/include/linux/msm_kgsl.h
+++ b/include/linux/msm_kgsl.h
@@ -20,6 +20,8 @@
#define KGSL_CONTEXT_TRASH_STATE 0x00000020
#define KGSL_CONTEXT_PER_CONTEXT_TS 0x00000040
#define KGSL_CONTEXT_USER_GENERATED_TS 0x00000080
+#define KGSL_CONTEXT_END_OF_FRAME 0x00000100
+
#define KGSL_CONTEXT_NO_FAULT_TOLERANCE 0x00000200
/* bits [12:15] are reserved for future use */
#define KGSL_CONTEXT_TYPE_MASK 0x01F00000