Merge "msm: kgsl: Synchronize access to IOMMU cfg port"
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 373c517..55597fc 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -141,6 +141,7 @@
  */
 
 #define ANY_ID (~0)
+#define NO_VER (~0)
 
 static const struct {
 	enum adreno_gpurev gpurev;
@@ -150,45 +151,53 @@
 	struct adreno_gpudev *gpudev;
 	unsigned int istore_size;
 	unsigned int pix_shader_start;
-	unsigned int instruction_size; /* Size of an instruction in dwords */
-	unsigned int gmem_size; /* size of gmem for gpu*/
+	/* Size of an instruction in dwords */
+	unsigned int instruction_size;
+	/* size of gmem for gpu*/
+	unsigned int gmem_size;
+	/* version of pm4 microcode that supports sync_lock
+	   between CPU and GPU for SMMU-v1 programming */
+	unsigned int sync_lock_pm4_ver;
+	/* version of pfp microcode that supports sync_lock
+	   between CPU and GPU for SMMU-v1 programming */
+	unsigned int sync_lock_pfp_ver;
 } adreno_gpulist[] = {
 	{ ADRENO_REV_A200, 0, 2, ANY_ID, ANY_ID,
 		"yamato_pm4.fw", "yamato_pfp.fw", &adreno_a2xx_gpudev,
-		512, 384, 3, SZ_256K },
+		512, 384, 3, SZ_256K, NO_VER, NO_VER },
 	{ ADRENO_REV_A203, 0, 1, 1, ANY_ID,
 		"yamato_pm4.fw", "yamato_pfp.fw", &adreno_a2xx_gpudev,
-		512, 384, 3, SZ_256K },
+		512, 384, 3, SZ_256K, NO_VER, NO_VER },
 	{ ADRENO_REV_A205, 0, 1, 0, ANY_ID,
 		"yamato_pm4.fw", "yamato_pfp.fw", &adreno_a2xx_gpudev,
-		512, 384, 3, SZ_256K },
+		512, 384, 3, SZ_256K, NO_VER, NO_VER },
 	{ ADRENO_REV_A220, 2, 1, ANY_ID, ANY_ID,
 		"leia_pm4_470.fw", "leia_pfp_470.fw", &adreno_a2xx_gpudev,
-		512, 384, 3, SZ_512K },
+		512, 384, 3, SZ_512K, NO_VER, NO_VER },
 	/*
 	 * patchlevel 5 (8960v2) needs special pm4 firmware to work around
 	 * a hardware problem.
 	 */
 	{ ADRENO_REV_A225, 2, 2, 0, 5,
 		"a225p5_pm4.fw", "a225_pfp.fw", &adreno_a2xx_gpudev,
-		1536, 768, 3, SZ_512K },
+		1536, 768, 3, SZ_512K, NO_VER, NO_VER },
 	{ ADRENO_REV_A225, 2, 2, 0, 6,
 		"a225_pm4.fw", "a225_pfp.fw", &adreno_a2xx_gpudev,
-		1536, 768, 3, SZ_512K },
+		1536, 768, 3, SZ_512K, 0x225011, 0x225002 },
 	{ ADRENO_REV_A225, 2, 2, ANY_ID, ANY_ID,
 		"a225_pm4.fw", "a225_pfp.fw", &adreno_a2xx_gpudev,
-		1536, 768, 3, SZ_512K },
+		1536, 768, 3, SZ_512K, 0x225011, 0x225002 },
 	/* A3XX doesn't use the pix_shader_start */
 	{ ADRENO_REV_A305, 3, 0, 5, ANY_ID,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
-		512, 0, 2, SZ_256K },
+		512, 0, 2, SZ_256K, 0x3FF037, 0x3FF016 },
 	/* A3XX doesn't use the pix_shader_start */
 	{ ADRENO_REV_A320, 3, 2, ANY_ID, ANY_ID,
 		"a300_pm4.fw", "a300_pfp.fw", &adreno_a3xx_gpudev,
-		512, 0, 2, SZ_512K },
+		512, 0, 2, SZ_512K, 0x3FF037, 0x3FF016 },
 	{ ADRENO_REV_A330, 3, 3, 0, 0,
 		"a330_pm4.fw", "a330_pfp.fw", &adreno_a3xx_gpudev,
-		512, 0, 2, SZ_1M },
+		512, 0, 2, SZ_1M, NO_VER, NO_VER },
 };
 
 static irqreturn_t adreno_irq_handler(struct kgsl_device *device)
@@ -282,7 +291,7 @@
 					uint32_t flags)
 {
 	unsigned int pt_val, reg_pt_val;
-	unsigned int link[200];
+	unsigned int link[250];
 	unsigned int *cmds = &link[0];
 	int sizedwords = 0;
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
@@ -315,6 +324,11 @@
 					device->mmu.setstate_memory.gpuaddr +
 					KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 
+	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
+
+	/* Acquire GPU-CPU sync Lock here */
+	cmds += kgsl_mmu_sync_lock(&device->mmu, cmds);
+
 	pt_val = kgsl_mmu_get_pt_base_addr(&device->mmu,
 					device->mmu.hwpagetable);
 	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
@@ -376,6 +390,9 @@
 		}
 	}
 
+	/* Release GPU-CPU sync Lock here */
+	cmds += kgsl_mmu_sync_unlock(&device->mmu, cmds);
+
 	if (cpu_is_msm8960())
 		cmds += adreno_add_change_mh_phys_limit_cmds(cmds,
 			kgsl_mmu_get_reg_gpuaddr(&device->mmu, 0,
@@ -388,6 +405,8 @@
 			device->mmu.setstate_memory.gpuaddr +
 			KGSL_IOMMU_SETSTATE_NOP_OFFSET);
 
+	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
+
 	sizedwords += (cmds - &link[0]);
 	if (sizedwords) {
 		/* invalidate all base pointers */
@@ -402,6 +421,11 @@
 		kgsl_mmu_disable_clk_on_ts(&device->mmu,
 		adreno_dev->ringbuffer.timestamp[KGSL_MEMSTORE_GLOBAL], true);
 	}
+
+	if (sizedwords > (ARRAY_SIZE(link))) {
+		KGSL_DRV_ERR(device, "Temp command buffer overflow\n");
+		BUG();
+	}
 }
 
 static void adreno_gpummu_setstate(struct kgsl_device *device,
@@ -637,6 +661,7 @@
 	adreno_dev->pix_shader_start = adreno_gpulist[i].pix_shader_start;
 	adreno_dev->instruction_size = adreno_gpulist[i].instruction_size;
 	adreno_dev->gmem_size = adreno_gpulist[i].gmem_size;
+	adreno_dev->gpulist_index = i;
 }
 
 static struct platform_device_id adreno_id_table[] = {
@@ -1186,12 +1211,36 @@
 	/* Identify the specific GPU */
 	adreno_identify_gpu(adreno_dev);
 
+	if (adreno_ringbuffer_read_pm4_ucode(device)) {
+		KGSL_DRV_ERR(device, "Reading pm4 microcode failed %s\n",
+			adreno_dev->pm4_fwfile);
+		BUG_ON(1);
+	}
+
+	if (adreno_ringbuffer_read_pfp_ucode(device)) {
+		KGSL_DRV_ERR(device, "Reading pfp microcode failed %s\n",
+			adreno_dev->pfp_fwfile);
+		BUG_ON(1);
+	}
+
 	if (adreno_dev->gpurev == ADRENO_REV_UNKNOWN) {
 		KGSL_DRV_ERR(device, "Unknown chip ID %x\n",
 			adreno_dev->chip_id);
 		goto error_clk_off;
 	}
 
+
+	/*
+	 * Check if firmware supports the sync lock PM4 packets needed
+	 * for IOMMUv1
+	 */
+
+	if ((adreno_dev->pm4_fw_version >=
+		adreno_gpulist[adreno_dev->gpulist_index].sync_lock_pm4_ver) &&
+		(adreno_dev->pfp_fw_version >=
+		adreno_gpulist[adreno_dev->gpulist_index].sync_lock_pfp_ver))
+		device->mmu.flags |= KGSL_MMU_FLAGS_IOMMU_SYNC;
+
 	/* Set up the MMU */
 	if (adreno_is_a2xx(adreno_dev)) {
 		/*
diff --git a/drivers/gpu/msm/adreno.h b/drivers/gpu/msm/adreno.h
index cf16995..836192c 100644
--- a/drivers/gpu/msm/adreno.h
+++ b/drivers/gpu/msm/adreno.h
@@ -86,9 +86,11 @@
 	const char *pfp_fwfile;
 	unsigned int *pfp_fw;
 	size_t pfp_fw_size;
+	unsigned int pfp_fw_version;
 	const char *pm4_fwfile;
 	unsigned int *pm4_fw;
 	size_t pm4_fw_size;
+	unsigned int pm4_fw_version;
 	struct adreno_ringbuffer ringbuffer;
 	unsigned int mharb;
 	struct adreno_gpudev *gpudev;
@@ -98,6 +100,7 @@
 	unsigned int instruction_size;
 	unsigned int ib_check_level;
 	unsigned int fast_hang_detect;
+	unsigned int gpulist_index;
 	struct ocmem_buf *ocmem_hdl;
 	unsigned int ocmem_base;
 };
@@ -366,4 +369,26 @@
 	return cmds - start;
 }
 
+/*
+ * adreno_idle_cmds - Add pm4 packets for GPU idle
+ * @adreno_dev - Pointer to device structure
+ * @cmds - Pointer to memory where idle commands need to be added
+ */
+static inline int adreno_add_idle_cmds(struct adreno_device *adreno_dev,
+							unsigned int *cmds)
+{
+	unsigned int *start = cmds;
+
+	*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1);
+	*cmds++ = 0;
+
+	if ((adreno_dev->gpurev == ADRENO_REV_A305) ||
+		(adreno_dev->gpurev == ADRENO_REV_A320)) {
+		*cmds++ = cp_type3_packet(CP_WAIT_FOR_ME, 1);
+		*cmds++ = 0;
+	}
+
+	return cmds - start;
+}
+
 #endif /*__ADRENO_H */
diff --git a/drivers/gpu/msm/adreno_pm4types.h b/drivers/gpu/msm/adreno_pm4types.h
index 016862b..6ec11ea 100644
--- a/drivers/gpu/msm/adreno_pm4types.h
+++ b/drivers/gpu/msm/adreno_pm4types.h
@@ -142,6 +142,12 @@
 /* copy sequencer instruction memory to system memory */
 #define CP_IM_STORE            0x2c
 
+/* test 2 memory locations to dword values specified */
+#define CP_TEST_TWO_MEMS	0x71
+
+/* PFP waits until the FIFO between the PFP and the ME is empty */
+#define CP_WAIT_FOR_ME		0x13
+
 /*
  * for a20x
  * program an offset that will added to the BIN_BASE value of
diff --git a/drivers/gpu/msm/adreno_ringbuffer.c b/drivers/gpu/msm/adreno_ringbuffer.c
index 8af361a..97b35b0 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.c
+++ b/drivers/gpu/msm/adreno_ringbuffer.c
@@ -209,10 +209,10 @@
 	return (*data != NULL) ? 0 : -ENOMEM;
 }
 
-static int adreno_ringbuffer_load_pm4_ucode(struct kgsl_device *device)
+int adreno_ringbuffer_read_pm4_ucode(struct kgsl_device *device)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	int i, ret = 0;
+	int ret = 0;
 
 	if (adreno_dev->pm4_fw == NULL) {
 		int len;
@@ -234,24 +234,41 @@
 
 		adreno_dev->pm4_fw_size = len / sizeof(uint32_t);
 		adreno_dev->pm4_fw = ptr;
+		adreno_dev->pm4_fw_version = adreno_dev->pm4_fw[1];
+	}
+
+err:
+	return ret;
+}
+
+
+int adreno_ringbuffer_load_pm4_ucode(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	int i;
+
+	if (adreno_dev->pm4_fw == NULL) {
+		int ret = adreno_ringbuffer_read_pm4_ucode(device);
+		if (ret)
+			return ret;
 	}
 
 	KGSL_DRV_INFO(device, "loading pm4 ucode version: %d\n",
-		adreno_dev->pm4_fw[0]);
+		adreno_dev->pm4_fw_version);
 
 	adreno_regwrite(device, REG_CP_DEBUG, CP_DEBUG_DEFAULT);
 	adreno_regwrite(device, REG_CP_ME_RAM_WADDR, 0);
 	for (i = 1; i < adreno_dev->pm4_fw_size; i++)
 		adreno_regwrite(device, REG_CP_ME_RAM_DATA,
-				     adreno_dev->pm4_fw[i]);
-err:
-	return ret;
+			adreno_dev->pm4_fw[i]);
+
+	return 0;
 }
 
-static int adreno_ringbuffer_load_pfp_ucode(struct kgsl_device *device)
+int adreno_ringbuffer_read_pfp_ucode(struct kgsl_device *device)
 {
 	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
-	int i, ret = 0;
+	int ret = 0;
 
 	if (adreno_dev->pfp_fw == NULL) {
 		int len;
@@ -272,18 +289,34 @@
 
 		adreno_dev->pfp_fw_size = len / sizeof(uint32_t);
 		adreno_dev->pfp_fw = ptr;
+		adreno_dev->pfp_fw_version = adreno_dev->pfp_fw[5];
+	}
+
+err:
+	return ret;
+}
+
+int adreno_ringbuffer_load_pfp_ucode(struct kgsl_device *device)
+{
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	int i;
+
+	if (adreno_dev->pfp_fw == NULL) {
+		int ret = adreno_ringbuffer_read_pfp_ucode(device);
+		if (ret)
+			return ret;
 	}
 
 	KGSL_DRV_INFO(device, "loading pfp ucode version: %d\n",
-		adreno_dev->pfp_fw[0]);
+			adreno_dev->pfp_fw_version);
 
 	adreno_regwrite(device, adreno_dev->gpudev->reg_cp_pfp_ucode_addr, 0);
 	for (i = 1; i < adreno_dev->pfp_fw_size; i++)
 		adreno_regwrite(device,
-			adreno_dev->gpudev->reg_cp_pfp_ucode_data,
-			adreno_dev->pfp_fw[i]);
-err:
-	return ret;
+		adreno_dev->gpudev->reg_cp_pfp_ucode_data,
+		adreno_dev->pfp_fw[i]);
+
+	return 0;
 }
 
 int adreno_ringbuffer_start(struct adreno_ringbuffer *rb, unsigned int init_ram)
@@ -390,7 +423,6 @@
 			     GSL_RB_MEMPTRS_SCRATCH_MASK);
 
 	/* load the CP ucode */
-
 	status = adreno_ringbuffer_load_pm4_ucode(device);
 	if (status != 0)
 		return status;
diff --git a/drivers/gpu/msm/adreno_ringbuffer.h b/drivers/gpu/msm/adreno_ringbuffer.h
index 4f58a15..50d9c25 100644
--- a/drivers/gpu/msm/adreno_ringbuffer.h
+++ b/drivers/gpu/msm/adreno_ringbuffer.h
@@ -130,6 +130,10 @@
 						struct adreno_context *context,
 						unsigned int numcmds);
 
+int adreno_ringbuffer_read_pfp_ucode(struct kgsl_device *device);
+
+int adreno_ringbuffer_read_pm4_ucode(struct kgsl_device *device);
+
 static inline int adreno_ringbuffer_count(struct adreno_ringbuffer *rb,
 	unsigned int rptr)
 {
diff --git a/drivers/gpu/msm/kgsl_iommu.c b/drivers/gpu/msm/kgsl_iommu.c
index 31491d5..b647ae2 100644
--- a/drivers/gpu/msm/kgsl_iommu.c
+++ b/drivers/gpu/msm/kgsl_iommu.c
@@ -18,6 +18,9 @@
 #include <linux/iommu.h>
 #include <linux/msm_kgsl.h>
 #include <mach/socinfo.h>
+#include <mach/msm_iomap.h>
+#include <mach/board.h>
+#include <stddef.h>
 
 #include "kgsl.h"
 #include "kgsl_device.h"
@@ -27,6 +30,8 @@
 #include "adreno_pm4types.h"
 #include "adreno.h"
 #include "kgsl_trace.h"
+#include "z180.h"
+
 
 static struct kgsl_iommu_register_list kgsl_iommuv1_reg[KGSL_IOMMU_REG_MAX] = {
 	{ 0, 0, 0 },				/* GLOBAL_BASE */
@@ -46,6 +51,8 @@
 	{ 0x008, 0, 0 }				/* RESUME */
 };
 
+struct remote_iommu_petersons_spinlock kgsl_iommu_sync_lock_vars;
+
 static int get_iommu_unit(struct device *dev, struct kgsl_mmu **mmu_out,
 			struct kgsl_iommu_unit **iommu_unit_out)
 {
@@ -546,6 +553,195 @@
 }
 
 /*
+ * kgsl_iommu_start_sync_lock - Initialize some variables during MMU start up
+ * for GPU CPU synchronization
+ * @mmu - Pointer to mmu device
+ *
+ * Return - 0 on success else error code
+ */
+static int kgsl_iommu_start_sync_lock(struct kgsl_mmu *mmu)
+{
+	struct kgsl_iommu *iommu = mmu->priv;
+	uint32_t lock_gpu_addr = 0;
+
+	if (KGSL_DEVICE_3D0 != mmu->device->id ||
+		!msm_soc_version_supports_iommu_v1() ||
+		!kgsl_mmu_is_perprocess() ||
+		iommu->sync_lock_vars)
+		return 0;
+
+	if (!(mmu->flags & KGSL_MMU_FLAGS_IOMMU_SYNC)) {
+		KGSL_DRV_ERR(mmu->device,
+		"The GPU microcode does not support IOMMUv1 sync opcodes\n");
+		return -ENXIO;
+	}
+	/* Store Lock variables GPU address  */
+	lock_gpu_addr = (iommu->sync_lock_desc.gpuaddr +
+			iommu->sync_lock_offset);
+
+	kgsl_iommu_sync_lock_vars.flag[PROC_APPS] = (lock_gpu_addr +
+		(offsetof(struct remote_iommu_petersons_spinlock,
+			flag[PROC_APPS])));
+	kgsl_iommu_sync_lock_vars.flag[PROC_GPU] = (lock_gpu_addr +
+		(offsetof(struct remote_iommu_petersons_spinlock,
+			flag[PROC_GPU])));
+	kgsl_iommu_sync_lock_vars.turn = (lock_gpu_addr +
+		(offsetof(struct remote_iommu_petersons_spinlock, turn)));
+
+	iommu->sync_lock_vars = &kgsl_iommu_sync_lock_vars;
+
+	return 0;
+}
+
+/*
+ * kgsl_get_sync_lock - Init Sync Lock between GPU and CPU
+ * @mmu - Pointer to mmu device
+ *
+ * Return - 0 on success else error code
+ */
+static int kgsl_iommu_init_sync_lock(struct kgsl_mmu *mmu)
+{
+	struct kgsl_iommu *iommu = mmu->device->mmu.priv;
+	int status = 0;
+	uint32_t lock_phy_addr = 0;
+	uint32_t page_offset = 0;
+
+	if (KGSL_DEVICE_3D0 != mmu->device->id ||
+		!msm_soc_version_supports_iommu_v1() ||
+		!kgsl_mmu_is_perprocess())
+		return status;
+
+	/* Return if already initialized */
+	if (iommu->sync_lock_initialized)
+		return status;
+
+	/* Get the physical address of the Lock variables */
+	lock_phy_addr = (msm_iommu_lock_initialize()
+			- MSM_SHARED_RAM_BASE + msm_shared_ram_phys);
+
+	if (!lock_phy_addr) {
+		KGSL_DRV_ERR(mmu->device,
+				"GPU CPU sync lock is not supported by kernel\n");
+		return -ENXIO;
+	}
+
+	/* Align the physical address to PAGE boundary and store the offset */
+	page_offset = (lock_phy_addr & (PAGE_SIZE - 1));
+	lock_phy_addr = (lock_phy_addr & ~(PAGE_SIZE - 1));
+	iommu->sync_lock_desc.physaddr = (unsigned int)lock_phy_addr;
+	iommu->sync_lock_offset = page_offset;
+
+	iommu->sync_lock_desc.size =
+				PAGE_ALIGN(sizeof(kgsl_iommu_sync_lock_vars));
+	status =  memdesc_sg_phys(&iommu->sync_lock_desc,
+				 iommu->sync_lock_desc.physaddr,
+				 iommu->sync_lock_desc.size);
+
+	if (status)
+		return status;
+
+	/* Flag Sync Lock is Initialized  */
+	iommu->sync_lock_initialized = 1;
+
+	return status;
+}
+
+/*
+ * kgsl_iommu_sync_lock - Acquire Sync Lock between GPU and CPU
+ * @mmu - Pointer to mmu device
+ * @cmds - Pointer to array of commands
+ *
+ * Return - int - number of commands.
+ */
+inline unsigned int kgsl_iommu_sync_lock(struct kgsl_mmu *mmu,
+						unsigned int *cmds)
+{
+	struct kgsl_device *device = mmu->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct kgsl_iommu *iommu = mmu->device->mmu.priv;
+	struct remote_iommu_petersons_spinlock *lock_vars =
+					iommu->sync_lock_vars;
+	unsigned int *start = cmds;
+
+	if (!iommu->sync_lock_initialized)
+		return 0;
+
+	*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
+	*cmds++ = lock_vars->flag[PROC_GPU];
+	*cmds++ = 1;
+
+	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
+
+	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
+	/* MEM SPACE = memory, FUNCTION = equals */
+	*cmds++ = 0x13;
+	*cmds++ = lock_vars->flag[PROC_GPU];
+	*cmds++ = 0x1;
+	*cmds++ = 0x1;
+	*cmds++ = 0x1;
+
+	*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
+	*cmds++ = lock_vars->turn;
+	*cmds++ = 0;
+
+	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
+
+	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
+	/* MEM SPACE = memory, FUNCTION = equals */
+	*cmds++ = 0x13;
+	*cmds++ = lock_vars->flag[PROC_GPU];
+	*cmds++ = 0x1;
+	*cmds++ = 0x1;
+	*cmds++ = 0x1;
+
+	*cmds++ = cp_type3_packet(CP_TEST_TWO_MEMS, 3);
+	*cmds++ = lock_vars->flag[PROC_APPS];
+	*cmds++ = lock_vars->turn;
+	*cmds++ = 0;
+
+	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
+
+	return cmds - start;
+}
+
+/*
+ * kgsl_iommu_sync_lock - Release Sync Lock between GPU and CPU
+ * @mmu - Pointer to mmu device
+ * @cmds - Pointer to array of commands
+ *
+ * Return - int - number of commands.
+ */
+inline unsigned int kgsl_iommu_sync_unlock(struct kgsl_mmu *mmu,
+					unsigned int *cmds)
+{
+	struct kgsl_device *device = mmu->device;
+	struct adreno_device *adreno_dev = ADRENO_DEVICE(device);
+	struct kgsl_iommu *iommu = mmu->device->mmu.priv;
+	struct remote_iommu_petersons_spinlock *lock_vars =
+						iommu->sync_lock_vars;
+	unsigned int *start = cmds;
+
+	if (!iommu->sync_lock_initialized)
+		return 0;
+
+	*cmds++ = cp_type3_packet(CP_MEM_WRITE, 2);
+	*cmds++ = lock_vars->flag[PROC_GPU];
+	*cmds++ = 0;
+
+	*cmds++ = cp_type3_packet(CP_WAIT_REG_MEM, 5);
+	/* MEM SPACE = memory, FUNCTION = equals */
+	*cmds++ = 0x13;
+	*cmds++ = lock_vars->flag[PROC_GPU];
+	*cmds++ = 0x0;
+	*cmds++ = 0x1;
+	*cmds++ = 0x1;
+
+	cmds += adreno_add_idle_cmds(adreno_dev, cmds);
+
+	return cmds - start;
+}
+
+/*
  * kgsl_get_iommu_ctxt - Get device pointer to IOMMU contexts
  * @mmu - Pointer to mmu device
  *
@@ -727,23 +923,27 @@
 		return 0;
 
 	for (i = 0; i < iommu->unit_count; i++) {
-		iommu->iommu_units[i].reg_map.priv |= KGSL_MEMDESC_GLOBAL;
-		status = kgsl_mmu_map(pt,
+		status = kgsl_mmu_map_global(pt,
 				&(iommu->iommu_units[i].reg_map),
 				GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
-		if (status) {
-			iommu->iommu_units[i].reg_map.priv &=
-				~KGSL_MEMDESC_GLOBAL;
+		if (status)
 			goto err;
-		}
 	}
+
+	/* Map Lock variables to GPU pagetable */
+	if (iommu->sync_lock_initialized) {
+		status = kgsl_mmu_map_global(pt, &iommu->sync_lock_desc,
+				GSL_PT_PAGE_RV | GSL_PT_PAGE_WV);
+		if (status)
+			goto err;
+	}
+
 	return 0;
 err:
-	for (i--; i >= 0; i--) {
+	for (i--; i >= 0; i--)
 		kgsl_mmu_unmap(pt,
 				&(iommu->iommu_units[i].reg_map));
-		iommu->iommu_units[i].reg_map.priv &= ~KGSL_MEMDESC_GLOBAL;
-	}
+
 	return status;
 }
 
@@ -763,6 +963,9 @@
 	int i;
 	for (i = 0; i < iommu->unit_count; i++)
 		kgsl_mmu_unmap(pt, &(iommu->iommu_units[i].reg_map));
+
+	if (iommu->sync_lock_desc.gpuaddr)
+		kgsl_mmu_unmap(pt, &iommu->sync_lock_desc);
 }
 
 
@@ -790,6 +993,9 @@
 	status = kgsl_set_register_map(mmu);
 	if (status)
 		goto done;
+	status = kgsl_iommu_init_sync_lock(mmu);
+	if (status)
+		goto done;
 
 	iommu->iommu_reg_list = kgsl_iommuv1_reg;
 	iommu->ctx_offset = KGSL_IOMMU_CTX_OFFSET_V1;
@@ -887,6 +1093,10 @@
 		if (status)
 			return -ENOMEM;
 	}
+	status = kgsl_iommu_start_sync_lock(mmu);
+	if (status)
+		return status;
+
 	/* We use the GPU MMU to control access to IOMMU registers on 8960 with
 	 * a225, hence we still keep the MMU active on 8960 */
 	if (cpu_is_msm8960()) {
@@ -1068,7 +1278,12 @@
 		if (reg_map->hostptr)
 			iounmap(reg_map->hostptr);
 		kgsl_sg_free(reg_map->sg, reg_map->sglen);
+		reg_map->priv &= ~KGSL_MEMDESC_GLOBAL;
 	}
+	/* clear IOMMU GPU CPU sync structures */
+	kgsl_sg_free(iommu->sync_lock_desc.sg, iommu->sync_lock_desc.sglen);
+	memset(&iommu->sync_lock_desc, 0, sizeof(iommu->sync_lock_desc));
+	iommu->sync_lock_vars = NULL;
 
 	kfree(iommu);
 
@@ -1125,6 +1340,9 @@
 	pt_base &= (iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_mask <<
 			iommu->iommu_reg_list[KGSL_IOMMU_CTX_TTBR0].reg_shift);
 
+	/* Acquire GPU-CPU sync Lock here */
+	msm_iommu_lock();
+
 	if (flags & KGSL_MMUFLAGS_PTUPDATE) {
 		kgsl_idle(mmu->device);
 		for (i = 0; i < iommu->unit_count; i++) {
@@ -1151,6 +1369,10 @@
 			mb();
 		}
 	}
+
+	/* Release GPU-CPU sync Lock here */
+	msm_iommu_unlock();
+
 	/* Disable smmu clock */
 	kgsl_iommu_disable_clk_on_ts(mmu, 0, false);
 }
@@ -1203,6 +1425,8 @@
 	/* These callbacks will be set on some chipsets */
 	.mmu_setup_pt = NULL,
 	.mmu_cleanup_pt = NULL,
+	.mmu_sync_lock = kgsl_iommu_sync_lock,
+	.mmu_sync_unlock = kgsl_iommu_sync_unlock,
 };
 
 struct kgsl_mmu_pt_ops iommu_pt_ops = {
diff --git a/drivers/gpu/msm/kgsl_iommu.h b/drivers/gpu/msm/kgsl_iommu.h
index 661b4f0..f539b07 100644
--- a/drivers/gpu/msm/kgsl_iommu.h
+++ b/drivers/gpu/msm/kgsl_iommu.h
@@ -120,6 +120,13 @@
  * @ctx_offset: The context offset to be added to base address when
  * accessing IOMMU registers
  * @iommu_reg_list: List of IOMMU registers { offset, map, shift } array
+ * @sync_lock_vars: Pointer to the IOMMU spinlock for serializing access to the
+ * IOMMU registers
+ * @sync_lock_desc: GPU Memory descriptor for the memory containing the
+ * spinlocks
+ * @sync_lock_offset - The page offset within a page at which the sync
+ * variables are located
+ * @sync_lock_initialized: True if the sync_lock feature is enabled
  */
 struct kgsl_iommu {
 	struct kgsl_iommu_unit iommu_units[KGSL_IOMMU_MAX_UNITS];
@@ -129,6 +136,10 @@
 	struct kgsl_device *device;
 	unsigned int ctx_offset;
 	struct kgsl_iommu_register_list *iommu_reg_list;
+	struct remote_iommu_petersons_spinlock *sync_lock_vars;
+	struct kgsl_memdesc sync_lock_desc;
+	unsigned int sync_lock_offset;
+	bool sync_lock_initialized;
 };
 
 /*
diff --git a/drivers/gpu/msm/kgsl_mmu.h b/drivers/gpu/msm/kgsl_mmu.h
index c8d637e..9d96633 100644
--- a/drivers/gpu/msm/kgsl_mmu.h
+++ b/drivers/gpu/msm/kgsl_mmu.h
@@ -152,6 +152,10 @@
 			struct kgsl_pagetable *pt);
 	void (*mmu_cleanup_pt) (struct kgsl_mmu *mmu,
 			struct kgsl_pagetable *pt);
+	unsigned int (*mmu_sync_lock)
+			(struct kgsl_mmu *mmu, unsigned int *cmds);
+	unsigned int (*mmu_sync_unlock)
+			(struct kgsl_mmu *mmu, unsigned int *cmds);
 };
 
 struct kgsl_mmu_pt_ops {
@@ -166,6 +170,8 @@
 	void (*mmu_destroy_pagetable) (void *pt);
 };
 
+#define KGSL_MMU_FLAGS_IOMMU_SYNC BIT(31)
+
 struct kgsl_mmu {
 	unsigned int     refcnt;
 	uint32_t      flags;
@@ -398,4 +404,24 @@
 	return 0;
 }
 
+static inline int kgsl_mmu_sync_lock(struct kgsl_mmu *mmu,
+				unsigned int *cmds)
+{
+	if ((mmu->flags & KGSL_MMU_FLAGS_IOMMU_SYNC) &&
+		mmu->mmu_ops && mmu->mmu_ops->mmu_sync_lock)
+		return mmu->mmu_ops->mmu_sync_lock(mmu, cmds);
+	else
+		return 0;
+}
+
+static inline int kgsl_mmu_sync_unlock(struct kgsl_mmu *mmu,
+				unsigned int *cmds)
+{
+	if ((mmu->flags & KGSL_MMU_FLAGS_IOMMU_SYNC) &&
+		mmu->mmu_ops && mmu->mmu_ops->mmu_sync_unlock)
+		return mmu->mmu_ops->mmu_sync_unlock(mmu, cmds);
+	else
+		return 0;
+}
+
 #endif /* __KGSL_MMU_H */