msm: kgsl: Add A6XX preemption support

This patch adds the basic a6xx preemption code, which
includes:
1. New PM4 packet sequences that inform the CP of the
outgoing and incoming contexts.
2. The preemption initialization sequence.
3. Definitions of preemption-related registers.
4. The fast preemption code.

Change-Id: I37ae9d6cdd3d6c7fd5cb635502f5535713042732
Signed-off-by: Harshdeep Dhatt <hdhatt@codeaurora.org>
diff --git a/drivers/gpu/msm/Makefile b/drivers/gpu/msm/Makefile
index f513207..0058226 100644
--- a/drivers/gpu/msm/Makefile
+++ b/drivers/gpu/msm/Makefile
@@ -38,6 +38,7 @@
 	adreno_a6xx_snapshot.o \
 	adreno_a4xx_preempt.o \
 	adreno_a5xx_preempt.o \
+	adreno_a6xx_preempt.o \
 	adreno_sysfs.o \
 	adreno.o \
 	adreno_cp_parser.o \
diff --git a/drivers/gpu/msm/a6xx_reg.h b/drivers/gpu/msm/a6xx_reg.h
index 58ef5ee..f4552b6 100644
--- a/drivers/gpu/msm/a6xx_reg.h
+++ b/drivers/gpu/msm/a6xx_reg.h
@@ -70,6 +70,15 @@
 #define A6XX_CP_ADDR_MODE_CNTL           0x842
 #define A6XX_CP_PROTECT_CNTL             0x84F
 #define A6XX_CP_PROTECT_REG              0x850
+#define A6XX_CP_CONTEXT_SWITCH_CNTL      0x8A0
+#define A6XX_CP_CONTEXT_SWITCH_SMMU_INFO_LO   0x8A1
+#define A6XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI   0x8A2
+#define A6XX_CP_CONTEXT_SWITCH_PRIV_NON_SECURE_RESTORE_ADDR_LO   0x8A3
+#define A6XX_CP_CONTEXT_SWITCH_PRIV_NON_SECURE_RESTORE_ADDR_HI   0x8A4
+#define A6XX_CP_CONTEXT_SWITCH_PRIV_SECURE_RESTORE_ADDR_LO   0x8A5
+#define A6XX_CP_CONTEXT_SWITCH_PRIV_SECURE_RESTORE_ADDR_HI   0x8A6
+#define A6XX_CP_CONTEXT_SWITCH_NON_PRIV_RESTORE_ADDR_LO   0x8A7
+#define A6XX_CP_CONTEXT_SWITCH_NON_PRIV_RESTORE_ADDR_HI   0x8A8
 #define A6XX_CP_PERFCTR_CP_SEL_0         0x8D0
 #define A6XX_CP_PERFCTR_CP_SEL_1         0x8D1
 #define A6XX_CP_PERFCTR_CP_SEL_2         0x8D2
@@ -590,6 +599,7 @@
 #define A6XX_RB_PERFCTR_CMP_SEL_1           0x8E2D
 #define A6XX_RB_PERFCTR_CMP_SEL_2           0x8E2E
 #define A6XX_RB_PERFCTR_CMP_SEL_3           0x8E2F
+#define A6XX_RB_CONTEXT_SWITCH_GMEM_SAVE_RESTORE 0x8E50
 
 /* PC registers */
 #define A6XX_PC_DBG_ECO_CNTL                0x9E00
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index 7a6581c..b3b4ccb 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -845,6 +845,7 @@
 				unsigned int *cmds,
 				struct kgsl_context *context);
 	int (*preemption_yield_enable)(unsigned int *);
+	unsigned int (*preemption_set_marker)(unsigned int *cmds, int start);
 	unsigned int (*preemption_post_ibsubmit)(
 				struct adreno_device *adreno_dev,
 				unsigned int *cmds);
diff --git a/drivers/gpu/msm/adreno_a6xx.c b/drivers/gpu/msm/adreno_a6xx.c
index 3cbb68e..2c46b93 100644
--- a/drivers/gpu/msm/adreno_a6xx.c
+++ b/drivers/gpu/msm/adreno_a6xx.c
@@ -29,9 +29,6 @@
 #include "kgsl_gmu.h"
 #include "kgsl_trace.h"
 
-#define A6XX_CP_RB_CNTL_DEFAULT (((ilog2(4) << 8) & 0x1F00) | \
-		(ilog2(KGSL_RB_DWORDS >> 1) & 0x3F))
-
 #define MIN_HBB		13
 
 #define A6XX_LLC_NUM_GPU_SCIDS		5
@@ -482,6 +479,12 @@
 	if (ADRENO_QUIRK(adreno_dev, ADRENO_QUIRK_TWO_PASS_USE_WFI))
 		kgsl_regrmw(device, A6XX_PC_DBG_ECO_CNTL, 0, (1 << 8));
 
+	/* Enable the GMEM save/restore feature for preemption */
+	if (adreno_is_preemption_enabled(adreno_dev))
+		kgsl_regwrite(device, A6XX_RB_CONTEXT_SWITCH_GMEM_SAVE_RESTORE,
+			0x1);
+
+	a6xx_preemption_start(adreno_dev);
 	a6xx_protect_init(adreno_dev);
 }
 
@@ -612,6 +615,70 @@
 }
 
 /*
+ * Follow the ME_INIT sequence with a preemption yield to allow the GPU to move
+ * to a different ringbuffer, if desired
+ */
+static int _preemption_init(struct adreno_device *adreno_dev,
+		struct adreno_ringbuffer *rb, unsigned int *cmds,
+		struct kgsl_context *context)
+{
+	unsigned int *cmds_orig = cmds;
+
+	/* Turn CP protection OFF */
+	*cmds++ = cp_type7_packet(CP_SET_PROTECTED_MODE, 1);
+	*cmds++ = 0;
+
+	*cmds++ = cp_type7_packet(CP_SET_PSEUDO_REGISTER, 6);
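+	/* Pseudo register 1: privileged non-secure save address */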
+	*cmds++ = 1;
+	cmds += cp_gpuaddr(adreno_dev, cmds,
+			rb->preemption_desc.gpuaddr);
+
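+	/* Pseudo register 2: privileged secure save address (none) */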
+	*cmds++ = 2;
+	cmds += cp_gpuaddr(adreno_dev, cmds, 0);
+
+	/* Turn CP protection ON */
+	*cmds++ = cp_type7_packet(CP_SET_PROTECTED_MODE, 1);
+	*cmds++ = 1;
+
+	*cmds++ = cp_type7_packet(CP_CONTEXT_SWITCH_YIELD, 4);
+	cmds += cp_gpuaddr(adreno_dev, cmds, 0x0);
+	*cmds++ = 0;
+	/* generate interrupt on preemption completion */
+	*cmds++ = 0;
+
+	return cmds - cmds_orig;
+}
+
+static int a6xx_post_start(struct adreno_device *adreno_dev)
+{
+	int ret;
+	unsigned int *cmds, *start;
+	struct adreno_ringbuffer *rb = adreno_dev->cur_rb;
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+
+	if (!adreno_is_preemption_enabled(adreno_dev))
+		return 0;
+
+	cmds = adreno_ringbuffer_allocspace(rb, 42);
+	if (IS_ERR(cmds)) {
+		KGSL_DRV_ERR(device, "error allocating preemption init cmds\n");
+		return PTR_ERR(cmds);
+	}
+	start = cmds;
+
+	cmds += _preemption_init(adreno_dev, rb, cmds, NULL);
+
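+	/* Return any unused dwords from the 42 reserved above */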
+	rb->_wptr = rb->_wptr - (42 - (cmds - start));
+
+	ret = adreno_ringbuffer_submit_spin(rb, NULL, 2000);
+	if (ret)
+		adreno_spin_idle_debug(adreno_dev,
+			"hw preemption initialization failed to idle\n");
+
+	return ret;
+}
+
+/*
  * a6xx_rb_start() - Start the ringbuffer
  * @adreno_dev: Pointer to adreno device
  * @start_type: Warm or cold start
@@ -651,7 +718,11 @@
 		return ret;
 
 	/* GPU comes up in secured mode, make it unsecured by default */
-	return adreno_set_unsecured_mode(adreno_dev, rb);
+	ret = adreno_set_unsecured_mode(adreno_dev, rb);
+	if (ret)
+		return ret;
+
+	return a6xx_post_start(adreno_dev);
 }
 
 static int _load_firmware(struct kgsl_device *device, const char *fwfile,
@@ -2086,7 +2157,7 @@
 	/* 6 - RBBM_ATB_ASYNC_OVERFLOW */
 	ADRENO_IRQ_CALLBACK(a6xx_err_callback),
 	ADRENO_IRQ_CALLBACK(NULL), /* 7 - GPC_ERR */
-	ADRENO_IRQ_CALLBACK(NULL),/* 8 - CP_SW */
+	ADRENO_IRQ_CALLBACK(a6xx_preemption_callback),/* 8 - CP_SW */
 	ADRENO_IRQ_CALLBACK(a6xx_cp_hw_err_callback), /* 9 - CP_HW_ERROR */
 	ADRENO_IRQ_CALLBACK(NULL),  /* 10 - CP_CCU_FLUSH_DEPTH_TS */
 	ADRENO_IRQ_CALLBACK(NULL), /* 11 - CP_CCU_FLUSH_COLOR_TS */
@@ -2580,6 +2651,11 @@
 	ADRENO_REG_DEFINE(ADRENO_REG_CP_IB2_BUFSZ, A6XX_CP_IB2_REM_SIZE),
 	ADRENO_REG_DEFINE(ADRENO_REG_CP_ROQ_ADDR, A6XX_CP_ROQ_DBG_ADDR),
 	ADRENO_REG_DEFINE(ADRENO_REG_CP_ROQ_DATA, A6XX_CP_ROQ_DBG_DATA),
+	ADRENO_REG_DEFINE(ADRENO_REG_CP_PREEMPT, A6XX_CP_CONTEXT_SWITCH_CNTL),
+	ADRENO_REG_DEFINE(ADRENO_REG_CP_CONTEXT_SWITCH_SMMU_INFO_LO,
+			A6XX_CP_CONTEXT_SWITCH_SMMU_INFO_LO),
+	ADRENO_REG_DEFINE(ADRENO_REG_CP_CONTEXT_SWITCH_SMMU_INFO_HI,
+			A6XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI),
 	ADRENO_REG_DEFINE(ADRENO_REG_RBBM_STATUS, A6XX_RBBM_STATUS),
 	ADRENO_REG_DEFINE(ADRENO_REG_RBBM_STATUS3, A6XX_RBBM_STATUS3),
 	ADRENO_REG_DEFINE(ADRENO_REG_RBBM_PERFCTR_CTL, A6XX_RBBM_PERFCTR_CNTL),
@@ -2693,4 +2769,9 @@
 	.iommu_fault_block = a6xx_iommu_fault_block,
 	.reset = a6xx_reset,
 	.soft_reset = a6xx_soft_reset,
+	.preemption_pre_ibsubmit = a6xx_preemption_pre_ibsubmit,
+	.preemption_post_ibsubmit = a6xx_preemption_post_ibsubmit,
+	.preemption_init = a6xx_preemption_init,
+	.preemption_schedule = a6xx_preemption_schedule,
+	.preemption_set_marker = a6xx_preemption_set_marker,
 };
diff --git a/drivers/gpu/msm/adreno_a6xx.h b/drivers/gpu/msm/adreno_a6xx.h
index 4b96f56..2738238 100644
--- a/drivers/gpu/msm/adreno_a6xx.h
+++ b/drivers/gpu/msm/adreno_a6xx.h
@@ -23,10 +23,87 @@
 #define CP_CLUSTER_SP_PS	0x4
 #define CP_CLUSTER_PS		0x5
 
+/**
+ * struct a6xx_cp_preemption_record - CP context record for
+ * preemption.
+ * @magic: (00) Value at this offset must be equal to
+ * A6XX_CP_CTXRECORD_MAGIC_REF.
+ * @info: (04) Type of record. Written non-zero (usually) by CP.
+ * We must set this to zero for all ringbuffers.
+ * @errno: (08) Error code. Initialize this to zero (no error).
+ * CP will update to another value if a preemption error occurs.
+ * @data: (12) DATA field in YIELD and SET_MARKER packets.
+ * Written by CP when switching out. Not used on switch-in. Initialized to 0.
+ * @cntl: (16) RB_CNTL, saved and restored by CP. We must initialize this.
+ * @rptr: (20) RB_RPTR, saved and restored by CP. We must initialize this.
+ * @wptr: (24) RB_WPTR, saved and restored by CP. We must initialize this.
+ * @_pad28: (28) Reserved/padding.
+ * @rptr_addr: (32) RB_RPTR_ADDR_LO|HI saved and restored. We must initialize.
+ * @rbase: (40) RB_BASE_LO|HI saved and restored.
+ * @counter: (48) Pointer to the preemption counter.
+ */
+struct a6xx_cp_preemption_record {
+	uint32_t  magic;
+	uint32_t  info;
+	uint32_t  errno;
+	uint32_t  data;
+	uint32_t  cntl;
+	uint32_t  rptr;
+	uint32_t  wptr;
+	uint32_t  _pad28;
+	uint64_t  rptr_addr;
+	uint64_t  rbase;
+	uint64_t  counter;
+};
+
+/**
+ * struct a6xx_cp_smmu_info - CP preemption SMMU info.
+ * @magic: (00) The value at this offset must be equal to
+ * A6XX_CP_SMMU_INFO_MAGIC_REF.
+ * @_pad4: (04) Reserved/padding
+ * @ttbr0: (08) Base address of the page table for the
+ * incoming context.
+ * @asid: (16) Address Space Identifier; not used by the CP.
+ * @context_idr: (20) Context Identification Register value.
+ */
+struct a6xx_cp_smmu_info {
+	uint32_t  magic;
+	uint32_t  _pad4;
+	uint64_t  ttbr0;
+	uint32_t  asid;
+	uint32_t  context_idr;
+};
+
+#define A6XX_CP_SMMU_INFO_MAGIC_REF     0x3618CDA3UL
+
+#define A6XX_CP_CTXRECORD_MAGIC_REF     0xAE399D6EUL
+/* Size of each CP preemption record */
+#define A6XX_CP_CTXRECORD_SIZE_IN_BYTES     (2112 * 1024)
+/* Size of the preemption counter block (in bytes) */
+#define A6XX_CP_CTXRECORD_PREEMPTION_COUNTER_SIZE   (16 * 4)
+/* Size of the performance counter save/restore block (in bytes) */
+#define A6XX_CP_PERFCOUNTER_SAVE_RESTORE_SIZE   (4 * 1024)
+
+#define A6XX_CP_RB_CNTL_DEFAULT (((ilog2(4) << 8) & 0x1F00) | \
+		(ilog2(KGSL_RB_DWORDS >> 1) & 0x3F))
+
+/* Preemption functions */
+void a6xx_preemption_trigger(struct adreno_device *adreno_dev);
+void a6xx_preemption_schedule(struct adreno_device *adreno_dev);
+void a6xx_preemption_start(struct adreno_device *adreno_dev);
+int a6xx_preemption_init(struct adreno_device *adreno_dev);
+
+unsigned int a6xx_preemption_post_ibsubmit(struct adreno_device *adreno_dev,
+		unsigned int *cmds);
+unsigned int a6xx_preemption_pre_ibsubmit(struct adreno_device *adreno_dev,
+		struct adreno_ringbuffer *rb,
+		unsigned int *cmds, struct kgsl_context *context);
+
+unsigned int a6xx_preemption_set_marker(unsigned int *cmds, int start);
+
+void a6xx_preemption_callback(struct adreno_device *adreno_dev, int bit);
 
 void a6xx_snapshot(struct adreno_device *adreno_dev,
 		struct kgsl_snapshot *snapshot);
 
 void a6xx_crashdump_init(struct adreno_device *adreno_dev);
-
 #endif
diff --git a/drivers/gpu/msm/adreno_a6xx_preempt.c b/drivers/gpu/msm/adreno_a6xx_preempt.c
new file mode 100644
index 0000000..c37791a
--- /dev/null
+++ b/drivers/gpu/msm/adreno_a6xx_preempt.c
@@ -0,0 +1,602 @@
+/* Copyright (c) 2017, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "adreno.h"
+#include "adreno_a6xx.h"
+#include "a6xx_reg.h"
+#include "adreno_trace.h"
+#include "adreno_pm4types.h"
+
+#define PREEMPT_RECORD(_field) \
+		offsetof(struct a6xx_cp_preemption_record, _field)
+
+#define PREEMPT_SMMU_RECORD(_field) \
+		offsetof(struct a6xx_cp_smmu_info, _field)
+
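+/* Pseudo register IDs for the CP_SET_PSEUDO_REGISTER payload */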
+enum {
+	SET_PSEUDO_REGISTER_SAVE_REGISTER_SMMU_INFO = 0,
+	SET_PSEUDO_REGISTER_SAVE_REGISTER_PRIV_NON_SECURE_SAVE_ADDR,
+	SET_PSEUDO_REGISTER_SAVE_REGISTER_PRIV_SECURE_SAVE_ADDR,
+	SET_PSEUDO_REGISTER_SAVE_REGISTER_NON_PRIV_SAVE_ADDR,
+	SET_PSEUDO_REGISTER_SAVE_REGISTER_COUNTER,
+};
+
+static void _update_wptr(struct adreno_device *adreno_dev, bool reset_timer)
+{
+	struct adreno_ringbuffer *rb = adreno_dev->cur_rb;
+	unsigned int wptr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rb->preempt_lock, flags);
+
+	adreno_readreg(adreno_dev, ADRENO_REG_CP_RB_WPTR, &wptr);
+
+	if (wptr != rb->wptr) {
+		adreno_writereg(adreno_dev, ADRENO_REG_CP_RB_WPTR,
+			rb->wptr);
+		/*
+		 * In case something got submitted while preemption was
+		 * ongoing, reset the timer.
+		 */
+		reset_timer = true;
+	}
+
+	if (reset_timer)
+		rb->dispatch_q.expires = jiffies +
+			msecs_to_jiffies(adreno_drawobj_timeout);
+
+	spin_unlock_irqrestore(&rb->preempt_lock, flags);
+}
+
+static inline bool adreno_move_preempt_state(struct adreno_device *adreno_dev,
+	enum adreno_preempt_states old, enum adreno_preempt_states new)
+{
+	return (atomic_cmpxchg(&adreno_dev->preempt.state, old, new) == old);
+}
+
+static void _a6xx_preemption_done(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	unsigned int status;
+
+	/*
+	 * In the very unlikely case that the power is off, do nothing - the
+	 * state will be reset on power up and everybody will be happy
+	 */
+
+	if (!kgsl_state_is_awake(device))
+		return;
+
+	adreno_readreg(adreno_dev, ADRENO_REG_CP_PREEMPT, &status);
+
+	if (status & 0x1) {
+		KGSL_DRV_ERR(device,
+			"Preemption not complete: status=%X cur=%d R/W=%X/%X next=%d R/W=%X/%X\n",
+			status, adreno_dev->cur_rb->id,
+			adreno_get_rptr(adreno_dev->cur_rb),
+			adreno_dev->cur_rb->wptr, adreno_dev->next_rb->id,
+			adreno_get_rptr(adreno_dev->next_rb),
+			adreno_dev->next_rb->wptr);
+
+		/* Set a fault and restart */
+		adreno_set_gpu_fault(adreno_dev, ADRENO_PREEMPT_FAULT);
+		adreno_dispatcher_schedule(device);
+
+		return;
+	}
+
+	del_timer_sync(&adreno_dev->preempt.timer);
+
+	trace_adreno_preempt_done(adreno_dev->cur_rb, adreno_dev->next_rb);
+
+	/* Clean up all the bits */
+	adreno_dev->prev_rb = adreno_dev->cur_rb;
+	adreno_dev->cur_rb = adreno_dev->next_rb;
+	adreno_dev->next_rb = NULL;
+
+	/* Update the wptr for the new command queue */
+	_update_wptr(adreno_dev, true);
+
+	/* Update the dispatcher timer for the new command queue */
+	mod_timer(&adreno_dev->dispatcher.timer,
+		adreno_dev->cur_rb->dispatch_q.expires);
+
+	/* Clear the preempt state */
+	adreno_set_preempt_state(adreno_dev, ADRENO_PREEMPT_NONE);
+}
+
+static void _a6xx_preemption_fault(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	unsigned int status;
+
+	/*
+	 * If the power is on check the preemption status one more time - if it
+	 * was successful then just transition to the complete state
+	 */
+	if (kgsl_state_is_awake(device)) {
+		adreno_readreg(adreno_dev, ADRENO_REG_CP_PREEMPT, &status);
+
+		if (status == 0) {
+			adreno_set_preempt_state(adreno_dev,
+				ADRENO_PREEMPT_COMPLETE);
+
+			adreno_dispatcher_schedule(device);
+			return;
+		}
+	}
+
+	KGSL_DRV_ERR(device,
+		"Preemption timed out: cur=%d R/W=%X/%X, next=%d R/W=%X/%X\n",
+		adreno_dev->cur_rb->id,
+		adreno_get_rptr(adreno_dev->cur_rb), adreno_dev->cur_rb->wptr,
+		adreno_dev->next_rb->id,
+		adreno_get_rptr(adreno_dev->next_rb),
+		adreno_dev->next_rb->wptr);
+
+	adreno_set_gpu_fault(adreno_dev, ADRENO_PREEMPT_FAULT);
+	adreno_dispatcher_schedule(device);
+}
+
+static void _a6xx_preemption_worker(struct work_struct *work)
+{
+	struct adreno_preemption *preempt = container_of(work,
+		struct adreno_preemption, work);
+	struct adreno_device *adreno_dev = container_of(preempt,
+		struct adreno_device, preempt);
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+
+	/* Need to take the mutex to make sure that the power stays on */
+	mutex_lock(&device->mutex);
+
+	if (adreno_in_preempt_state(adreno_dev, ADRENO_PREEMPT_FAULTED))
+		_a6xx_preemption_fault(adreno_dev);
+
+	mutex_unlock(&device->mutex);
+}
+
+static void _a6xx_preemption_timer(unsigned long data)
+{
+	struct adreno_device *adreno_dev = (struct adreno_device *) data;
+
+	/* We should only be here from a triggered state */
+	if (!adreno_move_preempt_state(adreno_dev,
+		ADRENO_PREEMPT_TRIGGERED, ADRENO_PREEMPT_FAULTED))
+		return;
+
+	/* Schedule the worker to take care of the details */
+	queue_work(system_unbound_wq, &adreno_dev->preempt.work);
+}
+
+/* Find the highest priority active ringbuffer */
+static struct adreno_ringbuffer *a6xx_next_ringbuffer(
+		struct adreno_device *adreno_dev)
+{
+	struct adreno_ringbuffer *rb;
+	unsigned long flags;
+	unsigned int i;
+
+	FOR_EACH_RINGBUFFER(adreno_dev, rb, i) {
+		bool empty;
+
+		spin_lock_irqsave(&rb->preempt_lock, flags);
+		empty = adreno_rb_empty(rb);
+		spin_unlock_irqrestore(&rb->preempt_lock, flags);
+
+		if (!empty)
+			return rb;
+	}
+
+	return NULL;
+}
+
+void a6xx_preemption_trigger(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	struct kgsl_iommu *iommu = KGSL_IOMMU_PRIV(device);
+	struct adreno_ringbuffer *next;
+	uint64_t ttbr0;
+	unsigned int contextidr;
+	unsigned long flags;
+	uint32_t preempt_level = 0, usesgmem = 1, skipsaverestore = 0;
+
+	/* Put ourselves into a possible trigger state */
+	if (!adreno_move_preempt_state(adreno_dev,
+		ADRENO_PREEMPT_NONE, ADRENO_PREEMPT_START))
+		return;
+
+	/* Get the next ringbuffer to preempt in */
+	next = a6xx_next_ringbuffer(adreno_dev);
+
+	/*
+	 * Nothing to do if every ringbuffer is empty or if the current
+	 * ringbuffer is the only active one
+	 */
+	if (next == NULL || next == adreno_dev->cur_rb) {
+		/*
+		 * Update any critical things that might have been skipped while
+		 * we were looking for a new ringbuffer
+		 */
+
+		if (next != NULL) {
+			_update_wptr(adreno_dev, false);
+
+			mod_timer(&adreno_dev->dispatcher.timer,
+				adreno_dev->cur_rb->dispatch_q.expires);
+		}
+
+		adreno_set_preempt_state(adreno_dev, ADRENO_PREEMPT_NONE);
+		return;
+	}
+
+	/* Turn off the dispatcher timer */
+	del_timer(&adreno_dev->dispatcher.timer);
+
+	/*
+	 * This is the most critical section - we need to take care not to race
+	 * until we have programmed the CP for the switch
+	 */
+
+	spin_lock_irqsave(&next->preempt_lock, flags);
+
+	/*
+	 * Get the pagetable from the pagetable info.
+	 * The pagetable_desc is allocated and mapped at probe time, and
+	 * preemption_desc at init time, so no need to check if
+	 * sharedmem accesses to these memdescs succeed.
+	 */
+	kgsl_sharedmem_readq(&next->pagetable_desc, &ttbr0,
+		PT_INFO_OFFSET(ttbr0));
+	kgsl_sharedmem_readl(&next->pagetable_desc, &contextidr,
+		PT_INFO_OFFSET(contextidr));
+
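+	/* Pass the next ringbuffer's wptr to the CP via its context record */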
+	kgsl_sharedmem_writel(device, &next->preemption_desc,
+		PREEMPT_RECORD(wptr), next->wptr);
+
+	spin_unlock_irqrestore(&next->preempt_lock, flags);
+
+	/* And write it to the smmu info */
+	kgsl_sharedmem_writeq(device, &iommu->smmu_info,
+		PREEMPT_SMMU_RECORD(ttbr0), ttbr0);
+	kgsl_sharedmem_writel(device, &iommu->smmu_info,
+		PREEMPT_SMMU_RECORD(context_idr), contextidr);
+
+	kgsl_regwrite(device,
+		A6XX_CP_CONTEXT_SWITCH_PRIV_NON_SECURE_RESTORE_ADDR_LO,
+		lower_32_bits(next->preemption_desc.gpuaddr));
+	kgsl_regwrite(device,
+		A6XX_CP_CONTEXT_SWITCH_PRIV_NON_SECURE_RESTORE_ADDR_HI,
+		upper_32_bits(next->preemption_desc.gpuaddr));
+
+	adreno_dev->next_rb = next;
+
+	/* Start the timer to detect a stuck preemption */
+	mod_timer(&adreno_dev->preempt.timer,
+		jiffies + msecs_to_jiffies(ADRENO_PREEMPT_TIMEOUT));
+
+	trace_adreno_preempt_trigger(adreno_dev->cur_rb, adreno_dev->next_rb);
+
+	adreno_set_preempt_state(adreno_dev, ADRENO_PREEMPT_TRIGGERED);
+
+	/* Trigger the preemption */
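+	/* [9]: skipsaverestore, [8]: usesgmem, [7:6]: preempt level, [0]: go */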
+	adreno_writereg(adreno_dev, ADRENO_REG_CP_PREEMPT,
+			((preempt_level << 6) & 0xC0) |
+			((skipsaverestore << 9) & 0x200) |
+			((usesgmem << 8) & 0x100) | 0x1);
+}
+
+void a6xx_preemption_callback(struct adreno_device *adreno_dev, int bit)
+{
+	unsigned int status;
+
+	if (!adreno_move_preempt_state(adreno_dev,
+		ADRENO_PREEMPT_TRIGGERED, ADRENO_PREEMPT_PENDING))
+		return;
+
+	adreno_readreg(adreno_dev, ADRENO_REG_CP_PREEMPT, &status);
+
+	if (status & 0x1) {
+		KGSL_DRV_ERR(KGSL_DEVICE(adreno_dev),
+			"preempt interrupt with non-zero status: %X\n", status);
+
+		/*
+		 * Under the assumption that this is a race between the
+		 * interrupt and the register, schedule the worker to clean up.
+		 * If the status still hasn't resolved itself by the time we
+		 * get there, we have to assume something bad happened.
+		 */
+		adreno_set_preempt_state(adreno_dev, ADRENO_PREEMPT_COMPLETE);
+		adreno_dispatcher_schedule(KGSL_DEVICE(adreno_dev));
+		return;
+	}
+
+	del_timer(&adreno_dev->preempt.timer);
+
+	trace_adreno_preempt_done(adreno_dev->cur_rb,
+		adreno_dev->next_rb);
+
+	adreno_dev->prev_rb = adreno_dev->cur_rb;
+	adreno_dev->cur_rb = adreno_dev->next_rb;
+	adreno_dev->next_rb = NULL;
+
+	/* Update the wptr if it changed while preemption was ongoing */
+	_update_wptr(adreno_dev, true);
+
+	/* Update the dispatcher timer for the new command queue */
+	mod_timer(&adreno_dev->dispatcher.timer,
+		adreno_dev->cur_rb->dispatch_q.expires);
+
+	adreno_set_preempt_state(adreno_dev, ADRENO_PREEMPT_NONE);
+
+	a6xx_preemption_trigger(adreno_dev);
+}
+
+void a6xx_preemption_schedule(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+
+	if (!adreno_is_preemption_enabled(adreno_dev))
+		return;
+
+	mutex_lock(&device->mutex);
+
+	if (adreno_in_preempt_state(adreno_dev, ADRENO_PREEMPT_COMPLETE))
+		_a6xx_preemption_done(adreno_dev);
+
+	a6xx_preemption_trigger(adreno_dev);
+
+	mutex_unlock(&device->mutex);
+}
+
+unsigned int a6xx_preemption_set_marker(unsigned int *cmds, int start)
+{
+	*cmds++ = cp_type7_packet(CP_SET_MARKER, 1);
+
+	/*
+	 * Indicate the beginning and end of the IB1 list with a SET_MARKER.
+	 * Among other things, this will implicitly enable and disable
+	 * preemption respectively.
+	 */
+	if (start)
+		*cmds++ = 0xD;
+	else
+		*cmds++ = 0xE;
+
+	return 2;
+}
+
+unsigned int a6xx_preemption_pre_ibsubmit(
+		struct adreno_device *adreno_dev,
+		struct adreno_ringbuffer *rb,
+		unsigned int *cmds, struct kgsl_context *context)
+{
+	unsigned int *cmds_orig = cmds;
+
+	*cmds++ = cp_type7_packet(CP_SET_PSEUDO_REGISTER, 12);
+
+	/* NULL SMMU_INFO buffer - we track in KMD */
+	*cmds++ = SET_PSEUDO_REGISTER_SAVE_REGISTER_SMMU_INFO;
+	cmds += cp_gpuaddr(adreno_dev, cmds, 0x0);
+
+	*cmds++ = SET_PSEUDO_REGISTER_SAVE_REGISTER_PRIV_NON_SECURE_SAVE_ADDR;
+	cmds += cp_gpuaddr(adreno_dev, cmds, rb->preemption_desc.gpuaddr);
+
+	*cmds++ = SET_PSEUDO_REGISTER_SAVE_REGISTER_PRIV_SECURE_SAVE_ADDR;
+	cmds += cp_gpuaddr(adreno_dev, cmds, 0);
+
+	/*
+	 * There is no need to specify this address again when triggering
+	 * preemption: the CP stores the address given here in the
+	 * CP_SET_PSEUDO_REGISTER payload into the context record, and so
+	 * knows where to restore the saved perfcounters from for the new
+	 * ringbuffer.
+	 */
+	*cmds++ = SET_PSEUDO_REGISTER_SAVE_REGISTER_COUNTER;
+	cmds += cp_gpuaddr(adreno_dev, cmds,
+			rb->perfcounter_save_restore_desc.gpuaddr);
+
+	return (unsigned int) (cmds - cmds_orig);
+}
+
+unsigned int a6xx_preemption_post_ibsubmit(struct adreno_device *adreno_dev,
+	unsigned int *cmds)
+{
+	unsigned int *cmds_orig = cmds;
+
+	*cmds++ = cp_type7_packet(CP_CONTEXT_SWITCH_YIELD, 4);
+	cmds += cp_gpuaddr(adreno_dev, cmds, 0x0);
+	*cmds++ = 1;
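+	/* generate interrupt on preemption completion */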
+	*cmds++ = 0;
+
+	return (unsigned int) (cmds - cmds_orig);
+}
+
+void a6xx_preemption_start(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	struct kgsl_iommu *iommu = KGSL_IOMMU_PRIV(device);
+	struct adreno_ringbuffer *rb;
+	unsigned int i;
+
+	if (!adreno_is_preemption_enabled(adreno_dev))
+		return;
+
+	/* Force the state to be clear */
+	adreno_set_preempt_state(adreno_dev, ADRENO_PREEMPT_NONE);
+
+	/* smmu_info is allocated and mapped in a6xx_preemption_iommu_init */
+	kgsl_sharedmem_writel(device, &iommu->smmu_info,
+		PREEMPT_SMMU_RECORD(magic), A6XX_CP_SMMU_INFO_MAGIC_REF);
+	kgsl_sharedmem_writeq(device, &iommu->smmu_info,
+		PREEMPT_SMMU_RECORD(ttbr0), MMU_DEFAULT_TTBR0(device));
+
+	/* The CP doesn't use the asid record, so poison it */
+	kgsl_sharedmem_writel(device, &iommu->smmu_info,
+		PREEMPT_SMMU_RECORD(asid), 0xDECAFBAD);
+	kgsl_sharedmem_writel(device, &iommu->smmu_info,
+		PREEMPT_SMMU_RECORD(context_idr),
+		MMU_DEFAULT_CONTEXTIDR(device));
+
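+	/* Tell the CP where to find the SMMU info record */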
+	adreno_writereg64(adreno_dev,
+		ADRENO_REG_CP_CONTEXT_SWITCH_SMMU_INFO_LO,
+		ADRENO_REG_CP_CONTEXT_SWITCH_SMMU_INFO_HI,
+		iommu->smmu_info.gpuaddr);
+
+	FOR_EACH_RINGBUFFER(adreno_dev, rb, i) {
+		/*
+		 * preemption_desc is allocated and mapped at init time,
+		 * so no need to check sharedmem_writel return value
+		 */
+		kgsl_sharedmem_writel(device, &rb->preemption_desc,
+			PREEMPT_RECORD(rptr), 0);
+		kgsl_sharedmem_writel(device, &rb->preemption_desc,
+			PREEMPT_RECORD(wptr), 0);
+
+		adreno_ringbuffer_set_pagetable(rb,
+			device->mmu.defaultpagetable);
+	}
+}
+
+static int a6xx_preemption_ringbuffer_init(struct adreno_device *adreno_dev,
+	struct adreno_ringbuffer *rb, uint64_t counteraddr)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	int ret;
+
+	ret = kgsl_allocate_global(device, &rb->preemption_desc,
+		A6XX_CP_CTXRECORD_SIZE_IN_BYTES, 0, KGSL_MEMDESC_PRIVILEGED,
+		"preemption_desc");
+	if (ret)
+		return ret;
+
+	ret = kgsl_allocate_global(device, &rb->perfcounter_save_restore_desc,
+		A6XX_CP_PERFCOUNTER_SAVE_RESTORE_SIZE, 0,
+		KGSL_MEMDESC_PRIVILEGED, "perfcounter_save_restore_desc");
+	if (ret)
+		return ret;
+
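+	/* Initialize the context record fields that the CP reads on restore */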
+	kgsl_sharedmem_writel(device, &rb->preemption_desc,
+		PREEMPT_RECORD(magic), A6XX_CP_CTXRECORD_MAGIC_REF);
+	kgsl_sharedmem_writel(device, &rb->preemption_desc,
+		PREEMPT_RECORD(info), 0);
+	kgsl_sharedmem_writel(device, &rb->preemption_desc,
+		PREEMPT_RECORD(data), 0);
+	kgsl_sharedmem_writel(device, &rb->preemption_desc,
+		PREEMPT_RECORD(cntl), A6XX_CP_RB_CNTL_DEFAULT);
+	kgsl_sharedmem_writel(device, &rb->preemption_desc,
+		PREEMPT_RECORD(rptr), 0);
+	kgsl_sharedmem_writel(device, &rb->preemption_desc,
+		PREEMPT_RECORD(wptr), 0);
+	kgsl_sharedmem_writeq(device, &rb->preemption_desc,
+		PREEMPT_RECORD(rptr_addr), SCRATCH_RPTR_GPU_ADDR(device,
+		rb->id));
+	kgsl_sharedmem_writeq(device, &rb->preemption_desc,
+		PREEMPT_RECORD(rbase), rb->buffer_desc.gpuaddr);
+	kgsl_sharedmem_writeq(device, &rb->preemption_desc,
+		PREEMPT_RECORD(counter), counteraddr);
+
+	return 0;
+}
+
+#ifdef CONFIG_QCOM_KGSL_IOMMU
+static int a6xx_preemption_iommu_init(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	struct kgsl_iommu *iommu = KGSL_IOMMU_PRIV(device);
+
+	/* Allocate mem for storing preemption smmu record */
+	return kgsl_allocate_global(device, &iommu->smmu_info, PAGE_SIZE,
+		KGSL_MEMFLAGS_GPUREADONLY, KGSL_MEMDESC_PRIVILEGED,
+		"smmu_info");
+}
+
+static void a6xx_preemption_iommu_close(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	struct kgsl_iommu *iommu = KGSL_IOMMU_PRIV(device);
+
+	kgsl_free_global(device, &iommu->smmu_info);
+}
+#else
+static int a6xx_preemption_iommu_init(struct adreno_device *adreno_dev)
+{
+	return -ENODEV;
+}
+
+static void a6xx_preemption_iommu_close(struct adreno_device *adreno_dev)
+{
+}
+#endif
+
+static void a6xx_preemption_close(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct adreno_preemption *preempt = &adreno_dev->preempt;
+	struct adreno_ringbuffer *rb;
+	unsigned int i;
+
+	del_timer(&preempt->timer);
+	kgsl_free_global(device, &preempt->counters);
+	a6xx_preemption_iommu_close(adreno_dev);
+
+	FOR_EACH_RINGBUFFER(adreno_dev, rb, i) {
+		kgsl_free_global(device, &rb->preemption_desc);
+		kgsl_free_global(device, &rb->perfcounter_save_restore_desc);
+	}
+}
+
+int a6xx_preemption_init(struct adreno_device *adreno_dev)
+{
+	struct kgsl_device *device = KGSL_DEVICE(adreno_dev);
+	struct adreno_preemption *preempt = &adreno_dev->preempt;
+	struct adreno_ringbuffer *rb;
+	int ret;
+	unsigned int i;
+	uint64_t addr;
+
+	/* We depend on the IOMMU for CP-side preemption to work */
+	if (kgsl_mmu_get_mmutype(device) != KGSL_MMU_TYPE_IOMMU)
+		return -ENODEV;
+
+	INIT_WORK(&preempt->work, _a6xx_preemption_worker);
+
+	setup_timer(&preempt->timer, _a6xx_preemption_timer,
+		(unsigned long) adreno_dev);
+
+	/* Allocate mem for storing preemption counters */
+	ret = kgsl_allocate_global(device, &preempt->counters,
+		adreno_dev->num_ringbuffers *
+		A6XX_CP_CTXRECORD_PREEMPTION_COUNTER_SIZE, 0, 0,
+		"preemption_counters");
+	if (ret)
+		goto err;
+
+	addr = preempt->counters.gpuaddr;
+
+	/* Allocate mem for storing preemption switch record */
+	FOR_EACH_RINGBUFFER(adreno_dev, rb, i) {
+		ret = a6xx_preemption_ringbuffer_init(adreno_dev, rb, addr);
+		if (ret)
+			goto err;
+
+		addr += A6XX_CP_CTXRECORD_PREEMPTION_COUNTER_SIZE;
+	}
+
+	ret = a6xx_preemption_iommu_init(adreno_dev);
+
+err:
+	if (ret)
+		a6xx_preemption_close(device);
+
+	return ret;
+}
diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h
index fceceda..2a330b4 100644
--- a/drivers/gpu/msm/adreno_pm4types.h
+++ b/drivers/gpu/msm/adreno_pm4types.h
@@ -55,6 +55,12 @@
 /* switches SMMU pagetable, used on a5xx only */
 #define CP_SMMU_TABLE_UPDATE 0x53
 
+/* Set internal CP registers; used to indicate context save data addresses */
+#define CP_SET_PSEUDO_REGISTER      0x56
+
+/* Tell CP the current operation mode; this selects the save/restore procedure */
+#define CP_SET_MARKER  0x65
+
 /* register read/modify/write */
 #define CP_REG_RMW		0x21
 
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index bff1fda..15c68fb 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -864,9 +864,12 @@
 			dwords += 2;
 	}
 
-	if (gpudev->preemption_yield_enable &&
-				adreno_is_preemption_enabled(adreno_dev))
-		dwords += 8;
+	if (adreno_is_preemption_enabled(adreno_dev)) {
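+		/* SET_MARKER emits 2 dwords before and 2 after the IB list */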
+		if (gpudev->preemption_set_marker)
+			dwords += 4;
+		else if (gpudev->preemption_yield_enable)
+			dwords += 8;
+	}
 
 	link = kcalloc(dwords, sizeof(unsigned int), GFP_KERNEL);
 	if (!link) {
@@ -897,6 +900,10 @@
 			gpu_ticks_submitted));
 	}
 
+	if (gpudev->preemption_set_marker &&
+			adreno_is_preemption_enabled(adreno_dev))
+		cmds += gpudev->preemption_set_marker(cmds, 1);
+
 	if (numibs) {
 		list_for_each_entry(ib, &cmdobj->cmdlist, node) {
 			/*
@@ -918,9 +925,12 @@
 		}
 	}
 
-	if (gpudev->preemption_yield_enable &&
-				adreno_is_preemption_enabled(adreno_dev))
-		cmds += gpudev->preemption_yield_enable(cmds);
+	if (adreno_is_preemption_enabled(adreno_dev)) {
+		if (gpudev->preemption_set_marker)
+			cmds += gpudev->preemption_set_marker(cmds, 0);
+		else if (gpudev->preemption_yield_enable)
+			cmds += gpudev->preemption_yield_enable(cmds);
+	}
 
 	if (kernel_profiling) {
 		cmds += _get_alwayson_counter(adreno_dev, cmds,
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index 63374af..72fc5bf3 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2002,2007-2016, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2002,2007-2017, The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -92,6 +92,8 @@
  * @drawctxt_active: The last pagetable that this ringbuffer is set to
  * @preemption_desc: The memory descriptor containing
  * preemption info written/read by CP
+ * @perfcounter_save_restore_desc: Used by CP to save/restore the perfcounter
+ * values across preemption
  * @pagetable_desc: Memory to hold information about the pagetables being used
  * and the commands to switch pagetable on the RB
  * @dispatch_q: The dispatcher side queue for this ringbuffer
@@ -118,6 +120,7 @@
 	struct kgsl_event_group events;
 	struct adreno_context *drawctxt_active;
 	struct kgsl_memdesc preemption_desc;
+	struct kgsl_memdesc perfcounter_save_restore_desc;
 	struct kgsl_memdesc pagetable_desc;
 	struct adreno_dispatcher_drawqueue dispatch_q;
 	wait_queue_head_t ts_expire_waitq;