drm/msm: add a3xx gpu support

Add initial support for a3xx 3d core.

With the hardware that I've seen to date, we can have:
 + zero, one, or two z180 2d cores
 + an a3xx or a2xx 3d core; the two share a common CP (the CP
   firmware seems to implement some different PM4 packet types,
   but the basics of cmdstream submission are the same)

Which means that the eventual complete "class" hierarchy, once
support for all past and present hw is in place, becomes:
 + msm_gpu
   + adreno_gpu
     + a3xx_gpu
     + a2xx_gpu
   + z180_gpu

This commit splits out the parts that will eventually be common
between a2xx/a3xx into adreno_gpu, and the parts that are common
even to z180 into msm_gpu.
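
In code, the hierarchy is expressed with struct embedding plus
container_of() upcasts.  Roughly (a trimmed-down sketch of the
structs added below; field lists elided):

    struct adreno_gpu {
            struct msm_gpu base;
            ...
    };

    struct a3xx_gpu {
            struct adreno_gpu base;
            ...
    };
    #define to_a3xx_gpu(x) container_of(x, struct a3xx_gpu, base)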

Note that there is no cmdstream validation required.  All memory access
from the GPU is via IOMMU/MMU.  So as long as you don't map silly things
to the GPU, there isn't much damage that the GPU can do.

Signed-off-by: Rob Clark <robdclark@gmail.com>
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index 4068122..439dfb5 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -4,6 +4,8 @@
 endif
 
 msm-y := \
+	adreno/adreno_gpu.o \
+	adreno/a3xx_gpu.o \
 	hdmi/hdmi.o \
 	hdmi/hdmi_connector.o \
 	hdmi/hdmi_i2c.o \
@@ -18,7 +20,10 @@
 	msm_connector.o \
 	msm_drv.o \
 	msm_fb.o \
-	msm_gem.o
+	msm_gem.o \
+	msm_gem_submit.o \
+	msm_gpu.o \
+	msm_ringbuffer.o
 
 msm-$(CONFIG_DRM_MSM_FBDEV) += msm_fbdev.o
 
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
new file mode 100644
index 0000000..13d61bb
--- /dev/null
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "a3xx_gpu.h"
+
+#define A3XX_INT0_MASK \
+	(A3XX_INT0_RBBM_AHB_ERROR |        \
+	 A3XX_INT0_RBBM_ATB_BUS_OVERFLOW | \
+	 A3XX_INT0_CP_T0_PACKET_IN_IB |    \
+	 A3XX_INT0_CP_OPCODE_ERROR |       \
+	 A3XX_INT0_CP_RESERVED_BIT_ERROR | \
+	 A3XX_INT0_CP_HW_FAULT |           \
+	 A3XX_INT0_CP_IB1_INT |            \
+	 A3XX_INT0_CP_IB2_INT |            \
+	 A3XX_INT0_CP_RB_INT |             \
+	 A3XX_INT0_CP_REG_PROTECT_FAULT |  \
+	 A3XX_INT0_CP_AHB_ERROR_HALT |     \
+	 A3XX_INT0_UCHE_OOB_ACCESS)
+
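+/* stashed by a3xx_probe(), consumed by a3xx_gpu_init(): */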
+static struct platform_device *a3xx_pdev;
+
+static void a3xx_me_init(struct msm_gpu *gpu)
+{
+	struct msm_ringbuffer *ring = gpu->rb;
+
+	OUT_PKT3(ring, CP_ME_INIT, 17);
+	OUT_RING(ring, 0x000003f7);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000080);
+	OUT_RING(ring, 0x00000100);
+	OUT_RING(ring, 0x00000180);
+	OUT_RING(ring, 0x00006600);
+	OUT_RING(ring, 0x00000150);
+	OUT_RING(ring, 0x0000014e);
+	OUT_RING(ring, 0x00000154);
+	OUT_RING(ring, 0x00000001);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000000);
+	OUT_RING(ring, 0x00000000);
+
+	gpu->funcs->flush(gpu);
+	gpu->funcs->idle(gpu);
+}
+
+static int a3xx_hw_init(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	uint32_t *ptr, len;
+	int i, ret;
+
+	DBG("%s", gpu->name);
+
+	if (adreno_is_a305(adreno_gpu)) {
+		/* Set up 16 deep read/write request queues: */
+		gpu_write(gpu, REG_A3XX_VBIF_IN_RD_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_RD_LIM_CONF1, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_RD_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_WR_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_WR_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_WR_LIM_CONF1, 0x10101010);
+		/* Enable WR-REQ: */
+		gpu_write(gpu, REG_A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x0000ff);
+		/* Set up round robin arbitration between both AXI ports: */
+		gpu_write(gpu, REG_A3XX_VBIF_ARB_CTL, 0x00000030);
+		/* Set up AOOO: */
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_AXI_AOOO_EN, 0x0000003c);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_AXI_AOOO, 0x003c003c);
+
+	} else if (adreno_is_a320(adreno_gpu)) {
+		/* Set up 16 deep read/write request queues: */
+		gpu_write(gpu, REG_A3XX_VBIF_IN_RD_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_RD_LIM_CONF1, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_RD_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_WR_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_WR_LIM_CONF0, 0x10101010);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_WR_LIM_CONF1, 0x10101010);
+		/* Enable WR-REQ: */
+		gpu_write(gpu, REG_A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x0000ff);
+		/* Set up round robin arbitration between both AXI ports: */
+		gpu_write(gpu, REG_A3XX_VBIF_ARB_CTL, 0x00000030);
+		/* Set up AOOO: */
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_AXI_AOOO_EN, 0x0000003c);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_AXI_AOOO, 0x003c003c);
+		/* Enable 1K sort: */
+		gpu_write(gpu, REG_A3XX_VBIF_ABIT_SORT, 0x000000ff);
+		gpu_write(gpu, REG_A3XX_VBIF_ABIT_SORT_CONF, 0x000000a4);
+
+	} else if (adreno_is_a330(adreno_gpu)) {
+		/* Set up 16 deep read/write request queues: */
+		gpu_write(gpu, REG_A3XX_VBIF_IN_RD_LIM_CONF0, 0x18181818);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_RD_LIM_CONF1, 0x18181818);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_RD_LIM_CONF0, 0x18181818);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_WR_LIM_CONF0, 0x18181818);
+		gpu_write(gpu, REG_A3XX_VBIF_DDR_OUT_MAX_BURST, 0x0000303);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_WR_LIM_CONF0, 0x18181818);
+		gpu_write(gpu, REG_A3XX_VBIF_IN_WR_LIM_CONF1, 0x18181818);
+		/* Enable WR-REQ: */
+		gpu_write(gpu, REG_A3XX_VBIF_GATE_OFF_WRREQ_EN, 0x00003f);
+		/* Set up round robin arbitration between both AXI ports: */
+		gpu_write(gpu, REG_A3XX_VBIF_ARB_CTL, 0x00000030);
+		/* Set up VBIF_ROUND_ROBIN_QOS_ARB: */
+		gpu_write(gpu, REG_A3XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x0001);
+		/* Set up AOOO: */
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_AXI_AOOO_EN, 0x0000ffff);
+		gpu_write(gpu, REG_A3XX_VBIF_OUT_AXI_AOOO, 0xffffffff);
+		/* Enable 1K sort: */
+		gpu_write(gpu, REG_A3XX_VBIF_ABIT_SORT, 0x0001ffff);
+		gpu_write(gpu, REG_A3XX_VBIF_ABIT_SORT_CONF, 0x000000a4);
+		/* Disable VBIF clock gating. This is to allow AXI to run at
+		 * a higher frequency than the GPU:
+		 */
+		gpu_write(gpu, REG_A3XX_VBIF_CLKON, 0x00000001);
+
+	} else {
+		BUG();
+	}
+
+	/* Make all blocks contribute to the GPU BUSY perf counter: */
+	gpu_write(gpu, REG_A3XX_RBBM_GPU_BUSY_MASKED, 0xffffffff);
+
+	/* Tune the hysteresis counters for SP and CP idle detection: */
+	gpu_write(gpu, REG_A3XX_RBBM_SP_HYST_CNT, 0x10);
+	gpu_write(gpu, REG_A3XX_RBBM_WAIT_IDLE_CLOCKS_CTL, 0x10);
+
+	/* Enable the RBBM error reporting bits.  This lets us get
+	 * useful information on failure:
+	 */
+	gpu_write(gpu, REG_A3XX_RBBM_AHB_CTL0, 0x00000001);
+
+	/* Enable AHB error reporting: */
+	gpu_write(gpu, REG_A3XX_RBBM_AHB_CTL1, 0xa6ffffff);
+
+	/* Turn on the power counters: */
+	gpu_write(gpu, REG_A3XX_RBBM_RBBM_CTL, 0x00030000);
+
+	/* Turn on hang detection - this spews a lot of useful information
+	 * into the RBBM registers on a hang:
+	 */
+	gpu_write(gpu, REG_A3XX_RBBM_INTERFACE_HANG_INT_CTL, 0x00010fff);
+
+	/* Enable 64-byte cacheline size. HW Default is 32-byte (0x000000E0): */
+	gpu_write(gpu, REG_A3XX_UCHE_CACHE_MODE_CONTROL_REG, 0x00000001);
+
+	/* Enable Clock gating: */
+	gpu_write(gpu, REG_A3XX_RBBM_CLOCK_CTL, 0xbfffffff);
+
+	/* Set the OCMEM base address for A330 */
+//TODO:
+//	if (adreno_is_a330(adreno_gpu)) {
+//		gpu_write(gpu, REG_A3XX_RB_GMEM_BASE_ADDR,
+//			(unsigned int)(a3xx_gpu->ocmem_base >> 14));
+//	}
+
+	/* Turn on performance counters: */
+	gpu_write(gpu, REG_A3XX_RBBM_PERFCTR_CTL, 0x01);
+
+	/* Set SP perfcounter 7 to count SP_FS_FULL_ALU_INSTRUCTIONS;
+	 * we will use this to augment our hang detection:
+	 */
+	gpu_write(gpu, REG_A3XX_SP_PERFCOUNTER7_SELECT,
+			SP_FS_FULL_ALU_INSTRUCTIONS);
+
+	gpu_write(gpu, REG_A3XX_RBBM_INT_0_MASK, A3XX_INT0_MASK);
+
+	ret = adreno_hw_init(gpu);
+	if (ret)
+		return ret;
+
+	/* setup access protection: */
+	gpu_write(gpu, REG_A3XX_CP_PROTECT_CTRL, 0x00000007);
+
+	/* RBBM registers */
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(0), 0x63000040);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(1), 0x62000080);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(2), 0x600000cc);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(3), 0x60000108);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(4), 0x64000140);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(5), 0x66000400);
+
+	/* CP registers */
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(6), 0x65000700);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(7), 0x610007d8);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(8), 0x620007e0);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(9), 0x61001178);
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(10), 0x64001180);
+
+	/* RB registers */
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(11), 0x60003300);
+
+	/* VBIF registers */
+	gpu_write(gpu, REG_A3XX_CP_PROTECT(12), 0x6b00c000);
+
+	/* NOTE: PM4/micro-engine firmware registers look to be the same
+	 * for a2xx and a3xx.. we could possibly push that part down to
+	 * adreno_gpu base class.  Or push both PM4 and PFP but
+	 * parameterize the pfp ucode addr/data registers..
+	 */
+
+	/* Load PM4: */
+	ptr = (uint32_t *)(adreno_gpu->pm4->data);
+	len = adreno_gpu->pm4->size / 4;
+	DBG("loading PM4 ucode version: %u", ptr[0]);
+
+	gpu_write(gpu, REG_AXXX_CP_DEBUG,
+			AXXX_CP_DEBUG_DYNAMIC_CLK_DISABLE |
+			AXXX_CP_DEBUG_MIU_128BIT_WRITE_ENABLE);
+	gpu_write(gpu, REG_AXXX_CP_ME_RAM_WADDR, 0);
+	for (i = 1; i < len; i++)
+		gpu_write(gpu, REG_AXXX_CP_ME_RAM_DATA, ptr[i]);
+
+	/* Load PFP: */
+	ptr = (uint32_t *)(adreno_gpu->pfp->data);
+	len = adreno_gpu->pfp->size / 4;
+	DBG("loading PFP ucode version: %u", ptr[0]);
+
+	gpu_write(gpu, REG_A3XX_CP_PFP_UCODE_ADDR, 0);
+	for (i = 1; i < len; i++)
+		gpu_write(gpu, REG_A3XX_CP_PFP_UCODE_DATA, ptr[i]);
+
+	/* CP ROQ queue sizes (bytes) - RB:16, ST:16, IB1:32, IB2:64 */
+	if (adreno_is_a305(adreno_gpu) || adreno_is_a320(adreno_gpu))
+		gpu_write(gpu, REG_AXXX_CP_QUEUE_THRESHOLDS,
+				AXXX_CP_QUEUE_THRESHOLDS_CSQ_IB1_START(2) |
+				AXXX_CP_QUEUE_THRESHOLDS_CSQ_IB2_START(6) |
+				AXXX_CP_QUEUE_THRESHOLDS_CSQ_ST_START(14));
+
+
+	/* clear ME_HALT to start micro engine */
+	gpu_write(gpu, REG_AXXX_CP_ME_CNTL, 0);
+
+	a3xx_me_init(gpu);
+
+	return 0;
+}
+
+static void a3xx_destroy(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a3xx_gpu *a3xx_gpu = to_a3xx_gpu(adreno_gpu);
+
+	DBG("%s", gpu->name);
+
+	adreno_gpu_cleanup(adreno_gpu);
+	put_device(&a3xx_gpu->pdev->dev);
+	kfree(a3xx_gpu);
+}
+
+static void a3xx_idle(struct msm_gpu *gpu)
+{
+	unsigned long t;
+
+	/* wait for ringbuffer to drain: */
+	adreno_idle(gpu);
+
+	t = jiffies + ADRENO_IDLE_TIMEOUT;
+
+	/* then wait for GPU to finish: */
+	do {
+		uint32_t rbbm_status = gpu_read(gpu, REG_A3XX_RBBM_STATUS);
+		if (!(rbbm_status & A3XX_RBBM_STATUS_GPU_BUSY))
+			return;
+	} while (time_before(jiffies, t));
+
+	DRM_ERROR("timeout waiting for %s to idle!\n", gpu->name);
+
+	/* TODO maybe we need to reset GPU here to recover from hang? */
+}
+
+static irqreturn_t a3xx_irq(struct msm_gpu *gpu)
+{
+	uint32_t status;
+
+	status = gpu_read(gpu, REG_A3XX_RBBM_INT_0_STATUS);
+	DBG("%s: %08x", gpu->name, status);
+
+	// TODO
+
+	gpu_write(gpu, REG_A3XX_RBBM_INT_CLEAR_CMD, status);
+
+	msm_gpu_retire(gpu);
+
+	return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static const unsigned int a3xx_registers[] = {
+	0x0000, 0x0002, 0x0010, 0x0012, 0x0018, 0x0018, 0x0020, 0x0027,
+	0x0029, 0x002b, 0x002e, 0x0033, 0x0040, 0x0042, 0x0050, 0x005c,
+	0x0060, 0x006c, 0x0080, 0x0082, 0x0084, 0x0088, 0x0090, 0x00e5,
+	0x00ea, 0x00ed, 0x0100, 0x0100, 0x0110, 0x0123, 0x01c0, 0x01c1,
+	0x01c3, 0x01c5, 0x01c7, 0x01c7, 0x01d5, 0x01d9, 0x01dc, 0x01dd,
+	0x01ea, 0x01ea, 0x01ee, 0x01f1, 0x01f5, 0x01f5, 0x01fc, 0x01ff,
+	0x0440, 0x0440, 0x0443, 0x0443, 0x0445, 0x0445, 0x044d, 0x044f,
+	0x0452, 0x0452, 0x0454, 0x046f, 0x047c, 0x047c, 0x047f, 0x047f,
+	0x0578, 0x057f, 0x0600, 0x0602, 0x0605, 0x0607, 0x060a, 0x060e,
+	0x0612, 0x0614, 0x0c01, 0x0c02, 0x0c06, 0x0c1d, 0x0c3d, 0x0c3f,
+	0x0c48, 0x0c4b, 0x0c80, 0x0c80, 0x0c88, 0x0c8b, 0x0ca0, 0x0cb7,
+	0x0cc0, 0x0cc1, 0x0cc6, 0x0cc7, 0x0ce4, 0x0ce5, 0x0e00, 0x0e05,
+	0x0e0c, 0x0e0c, 0x0e22, 0x0e23, 0x0e41, 0x0e45, 0x0e64, 0x0e65,
+	0x0e80, 0x0e82, 0x0e84, 0x0e89, 0x0ea0, 0x0ea1, 0x0ea4, 0x0ea7,
+	0x0ec4, 0x0ecb, 0x0ee0, 0x0ee0, 0x0f00, 0x0f01, 0x0f03, 0x0f09,
+	0x2040, 0x2040, 0x2044, 0x2044, 0x2048, 0x204d, 0x2068, 0x2069,
+	0x206c, 0x206d, 0x2070, 0x2070, 0x2072, 0x2072, 0x2074, 0x2075,
+	0x2079, 0x207a, 0x20c0, 0x20d3, 0x20e4, 0x20ef, 0x2100, 0x2109,
+	0x210c, 0x210c, 0x210e, 0x210e, 0x2110, 0x2111, 0x2114, 0x2115,
+	0x21e4, 0x21e4, 0x21ea, 0x21ea, 0x21ec, 0x21ed, 0x21f0, 0x21f0,
+	0x2200, 0x2212, 0x2214, 0x2217, 0x221a, 0x221a, 0x2240, 0x227e,
+	0x2280, 0x228b, 0x22c0, 0x22c0, 0x22c4, 0x22ce, 0x22d0, 0x22d8,
+	0x22df, 0x22e6, 0x22e8, 0x22e9, 0x22ec, 0x22ec, 0x22f0, 0x22f7,
+	0x22ff, 0x22ff, 0x2340, 0x2343, 0x2348, 0x2349, 0x2350, 0x2356,
+	0x2360, 0x2360, 0x2440, 0x2440, 0x2444, 0x2444, 0x2448, 0x244d,
+	0x2468, 0x2469, 0x246c, 0x246d, 0x2470, 0x2470, 0x2472, 0x2472,
+	0x2474, 0x2475, 0x2479, 0x247a, 0x24c0, 0x24d3, 0x24e4, 0x24ef,
+	0x2500, 0x2509, 0x250c, 0x250c, 0x250e, 0x250e, 0x2510, 0x2511,
+	0x2514, 0x2515, 0x25e4, 0x25e4, 0x25ea, 0x25ea, 0x25ec, 0x25ed,
+	0x25f0, 0x25f0, 0x2600, 0x2612, 0x2614, 0x2617, 0x261a, 0x261a,
+	0x2640, 0x267e, 0x2680, 0x268b, 0x26c0, 0x26c0, 0x26c4, 0x26ce,
+	0x26d0, 0x26d8, 0x26df, 0x26e6, 0x26e8, 0x26e9, 0x26ec, 0x26ec,
+	0x26f0, 0x26f7, 0x26ff, 0x26ff, 0x2740, 0x2743, 0x2748, 0x2749,
+	0x2750, 0x2756, 0x2760, 0x2760, 0x300c, 0x300e, 0x301c, 0x301d,
+	0x302a, 0x302a, 0x302c, 0x302d, 0x3030, 0x3031, 0x3034, 0x3036,
+	0x303c, 0x303c, 0x305e, 0x305f,
+};
+
+static void a3xx_show(struct msm_gpu *gpu, struct seq_file *m)
+{
+	int i;
+
+	adreno_show(gpu, m);
+	seq_printf(m, "status:   %08x\n",
+			gpu_read(gpu, REG_A3XX_RBBM_STATUS));
+
+	/* dump these out in a form that can be parsed by demsm: */
+	seq_printf(m, "IO:region %s 00000000 00020000\n", gpu->name);
+	for (i = 0; i < ARRAY_SIZE(a3xx_registers); i += 2) {
+		uint32_t start = a3xx_registers[i];
+		uint32_t end   = a3xx_registers[i+1];
+		uint32_t addr;
+
+		for (addr = start; addr <= end; addr++) {
+			uint32_t val = gpu_read(gpu, addr);
+			seq_printf(m, "IO:R %08x %08x\n", addr<<2, val);
+		}
+	}
+}
+#endif
+
+static const struct adreno_gpu_funcs funcs = {
+	.base = {
+		.get_param = adreno_get_param,
+		.hw_init = a3xx_hw_init,
+		.pm_suspend = msm_gpu_pm_suspend,
+		.pm_resume = msm_gpu_pm_resume,
+		.last_fence = adreno_last_fence,
+		.submit = adreno_submit,
+		.flush = adreno_flush,
+		.idle = a3xx_idle,
+		.irq = a3xx_irq,
+		.destroy = a3xx_destroy,
+#ifdef CONFIG_DEBUG_FS
+		.show = a3xx_show,
+#endif
+	},
+};
+
+struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
+{
+	struct a3xx_gpu *a3xx_gpu = NULL;
+	struct msm_gpu *gpu;
+	struct platform_device *pdev = a3xx_pdev;
+	struct adreno_platform_config *config;
+	int ret;
+
+	if (!pdev) {
+		dev_err(dev->dev, "no a3xx device\n");
+		ret = -ENXIO;
+		goto fail;
+	}
+
+	config = pdev->dev.platform_data;
+
+	a3xx_gpu = kzalloc(sizeof(*a3xx_gpu), GFP_KERNEL);
+	if (!a3xx_gpu) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	gpu = &a3xx_gpu->base.base;
+
+	get_device(&pdev->dev);
+	a3xx_gpu->pdev = pdev;
+
+	gpu->fast_rate = config->fast_rate;
+	gpu->slow_rate = config->slow_rate;
+	gpu->bus_freq  = config->bus_freq;
+
+	DBG("fast_rate=%u, slow_rate=%u, bus_freq=%u",
+			gpu->fast_rate, gpu->slow_rate, gpu->bus_freq);
+
+	ret = adreno_gpu_init(dev, pdev, &a3xx_gpu->base,
+			&funcs, config->rev);
+	if (ret)
+		goto fail;
+
+	return &a3xx_gpu->base.base;
+
+fail:
+	if (a3xx_gpu)
+		a3xx_destroy(&a3xx_gpu->base.base);
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * The a3xx device:
+ */
+
+static int a3xx_probe(struct platform_device *pdev)
+{
+	static struct adreno_platform_config config = {};
+#ifdef CONFIG_OF
+	/* TODO */
+#else
+	uint32_t version = socinfo_get_version();
+	if (cpu_is_apq8064ab()) {
+		config.fast_rate = 450000000;
+		config.slow_rate = 27000000;
+		config.bus_freq  = 4;
+		config.rev = ADRENO_REV(3, 2, 1, 0);
+	} else if (cpu_is_apq8064() || cpu_is_msm8960ab()) {
+		config.fast_rate = 400000000;
+		config.slow_rate = 27000000;
+		config.bus_freq  = 4;
+
+		if (SOCINFO_VERSION_MAJOR(version) == 2)
+			config.rev = ADRENO_REV(3, 2, 0, 2);
+		else if ((SOCINFO_VERSION_MAJOR(version) == 1) &&
+				(SOCINFO_VERSION_MINOR(version) == 1))
+			config.rev = ADRENO_REV(3, 2, 0, 1);
+		else
+			config.rev = ADRENO_REV(3, 2, 0, 0);
+
+	} else if (cpu_is_msm8930()) {
+		config.fast_rate = 400000000;
+		config.slow_rate = 27000000;
+		config.bus_freq  = 3;
+
+		if ((SOCINFO_VERSION_MAJOR(version) == 1) &&
+			(SOCINFO_VERSION_MINOR(version) == 2))
+			config.rev = ADRENO_REV(3, 0, 5, 2);
+		else
+			config.rev = ADRENO_REV(3, 0, 5, 0);
+
+	}
+#endif
+	pdev->dev.platform_data = &config;
+	a3xx_pdev = pdev;
+	return 0;
+}
+
+static int a3xx_remove(struct platform_device *pdev)
+{
+	a3xx_pdev = NULL;
+	return 0;
+}
+
+static struct platform_driver a3xx_driver = {
+	.probe = a3xx_probe,
+	.remove = a3xx_remove,
+	.driver.name = "kgsl-3d0",
+};
+
+void __init a3xx_register(void)
+{
+	platform_driver_register(&a3xx_driver);
+}
+
+void __exit a3xx_unregister(void)
+{
+	platform_driver_unregister(&a3xx_driver);
+}
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.h b/drivers/gpu/drm/msm/adreno/a3xx_gpu.h
new file mode 100644
index 0000000..32c398c
--- /dev/null
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __A3XX_GPU_H__
+#define __A3XX_GPU_H__
+
+#include "adreno_gpu.h"
+#include "a3xx.xml.h"
+
+struct a3xx_gpu {
+	struct adreno_gpu base;
+	struct platform_device *pdev;
+};
+#define to_a3xx_gpu(x) container_of(x, struct a3xx_gpu, base)
+
+#endif /* __A3XX_GPU_H__ */
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
new file mode 100644
index 0000000..282163e
--- /dev/null
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "adreno_gpu.h"
+#include "msm_gem.h"
+
+struct adreno_info {
+	struct adreno_rev rev;
+	uint32_t revn;
+	const char *name;
+	const char *pm4fw, *pfpfw;
+	uint32_t gmem;
+};
+
+#define ANY_ID 0xff
+
+static const struct adreno_info gpulist[] = {
+	{
+		.rev   = ADRENO_REV(3, 0, 5, ANY_ID),
+		.revn  = 305,
+		.name  = "A305",
+		.pm4fw = "a300_pm4.fw",
+		.pfpfw = "a300_pfp.fw",
+		.gmem  = SZ_256K,
+	}, {
+		.rev   = ADRENO_REV(3, 2, ANY_ID, ANY_ID),
+		.revn  = 320,
+		.name  = "A320",
+		.pm4fw = "a300_pm4.fw",
+		.pfpfw = "a300_pfp.fw",
+		.gmem  = SZ_512K,
+	}, {
+		.rev   = ADRENO_REV(3, 3, 0, 0),
+		.revn  = 330,
+		.name  = "A330",
+		.pm4fw = "a330_pm4.fw",
+		.pfpfw = "a330_pfp.fw",
+		.gmem  = SZ_1M,
+	},
+};
+
+#define RB_SIZE    SZ_32K
+#define RB_BLKSIZE 16
+
+int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+
+	switch (param) {
+	case MSM_PARAM_GPU_ID:
+		*value = adreno_gpu->info->revn;
+		return 0;
+	case MSM_PARAM_GMEM_SIZE:
+		*value = adreno_gpu->info->gmem;
+		return 0;
+	default:
+		DBG("%s: invalid param: %u", gpu->name, param);
+		return -EINVAL;
+	}
+}
+
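+/* GPU (iova) address of a member of the shared adreno_rbmemptrs buffer: */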
+#define rbmemptr(adreno_gpu, member)  \
+	((adreno_gpu)->memptrs_iova + offsetof(struct adreno_rbmemptrs, member))
+
+int adreno_hw_init(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+
+	DBG("%s", gpu->name);
+
+	/* Setup REG_CP_RB_CNTL: */
+	gpu_write(gpu, REG_AXXX_CP_RB_CNTL,
+			/* size is log2(quad-words): */
+			AXXX_CP_RB_CNTL_BUFSZ(ilog2(gpu->rb->size / 8)) |
+			AXXX_CP_RB_CNTL_BLKSZ(RB_BLKSIZE));
+
+	/* Setup ringbuffer address: */
+	gpu_write(gpu, REG_AXXX_CP_RB_BASE, gpu->rb_iova);
+	gpu_write(gpu, REG_AXXX_CP_RB_RPTR_ADDR, rbmemptr(adreno_gpu, rptr));
+
+	/* Setup scratch/timestamp: */
+	gpu_write(gpu, REG_AXXX_SCRATCH_ADDR, rbmemptr(adreno_gpu, fence));
+
+	gpu_write(gpu, REG_AXXX_SCRATCH_UMSK, 0x1);
+
+	return 0;
+}
+
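+/* current ring write position (the value programmed into CP_RB_WPTR): */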
+static uint32_t get_wptr(struct msm_ringbuffer *ring)
+{
+	return ring->cur - ring->start;
+}
+
+uint32_t adreno_last_fence(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	return adreno_gpu->memptrs->fence;
+}
+
+int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
+		struct msm_file_private *ctx)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct msm_drm_private *priv = gpu->dev->dev_private;
+	struct msm_ringbuffer *ring = gpu->rb;
+	unsigned i, ibs = 0;
+
+	adreno_gpu->last_fence = submit->fence;
+
+	for (i = 0; i < submit->nr_cmds; i++) {
+		switch (submit->cmd[i].type) {
+		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
+			/* ignore IB-targets */
+			break;
+		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
+			/* ignore if there has not been a ctx switch: */
+			if (priv->lastctx == ctx)
+				break;
+		case MSM_SUBMIT_CMD_BUF:
+			OUT_PKT3(ring, CP_INDIRECT_BUFFER_PFD, 2);
+			OUT_RING(ring, submit->cmd[i].iova);
+			OUT_RING(ring, submit->cmd[i].size);
+			ibs++;
+			break;
+		}
+	}
+
+	/* on a320, at least, we seem to need to pad things out to an
+	 * even number of qwords to avoid an issue with the CP hanging
+	 * on wrap-around:
+	 */
+	if (ibs % 2)
+		OUT_PKT2(ring);
+
+	OUT_PKT0(ring, REG_AXXX_CP_SCRATCH_REG2, 1);
+	OUT_RING(ring, submit->fence);
+
+	if (adreno_is_a3xx(adreno_gpu)) {
+		/* Flush HLSQ lazy updates to make sure there is nothing
+		 * pending for indirect loads after the timestamp has
+		 * passed:
+		 */
+		OUT_PKT3(ring, CP_EVENT_WRITE, 1);
+		OUT_RING(ring, HLSQ_FLUSH);
+
+		OUT_PKT3(ring, CP_WAIT_FOR_IDLE, 1);
+		OUT_RING(ring, 0x00000000);
+	}
+
+	OUT_PKT3(ring, CP_EVENT_WRITE, 3);
+	OUT_RING(ring, CACHE_FLUSH_TS);
+	OUT_RING(ring, rbmemptr(adreno_gpu, fence));
+	OUT_RING(ring, submit->fence);
+
+	/* we could maybe be clever and only CP_COND_EXEC the interrupt: */
+	OUT_PKT3(ring, CP_INTERRUPT, 1);
+	OUT_RING(ring, 0x80000000);
+
+#if 0
+	if (adreno_is_a3xx(adreno_gpu)) {
+		/* Dummy set-constant to trigger context rollover */
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A3XX_HLSQ_CL_KERNEL_GROUP_X_REG));
+		OUT_RING(ring, 0x00000000);
+	}
+#endif
+
+	gpu->funcs->flush(gpu);
+
+	return 0;
+}
+
+void adreno_flush(struct msm_gpu *gpu)
+{
+	uint32_t wptr = get_wptr(gpu->rb);
+
+	/* ensure writes to ringbuffer have hit system memory: */
+	mb();
+
+	gpu_write(gpu, REG_AXXX_CP_RB_WPTR, wptr);
+}
+
+void adreno_idle(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	uint32_t rptr, wptr = get_wptr(gpu->rb);
+	unsigned long t;
+
+	t = jiffies + ADRENO_IDLE_TIMEOUT;
+
+	/* then wait for CP to drain ringbuffer: */
+	do {
+		rptr = adreno_gpu->memptrs->rptr;
+		if (rptr == wptr)
+			return;
+	} while (time_before(jiffies, t));
+
+	DRM_ERROR("timeout waiting for %s to drain ringbuffer!\n", gpu->name);
+
+	/* TODO maybe we need to reset GPU here to recover from hang? */
+}
+
+#ifdef CONFIG_DEBUG_FS
+void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+
+	seq_printf(m, "revision: %d (%d.%d.%d.%d)\n",
+			adreno_gpu->info->revn, adreno_gpu->rev.core,
+			adreno_gpu->rev.major, adreno_gpu->rev.minor,
+			adreno_gpu->rev.patchid);
+
+	seq_printf(m, "fence:    %d/%d\n", adreno_gpu->memptrs->fence,
+			adreno_gpu->last_fence);
+	seq_printf(m, "rptr:     %d\n", adreno_gpu->memptrs->rptr);
+	seq_printf(m, "wptr:     %d\n", adreno_gpu->memptrs->wptr);
+	seq_printf(m, "rb wptr:  %d\n", get_wptr(gpu->rb));
+}
+#endif
+
+void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	uint32_t freedwords;
+	do {
+		uint32_t size = gpu->rb->size / 4;
+		uint32_t wptr = get_wptr(gpu->rb);
+		uint32_t rptr = adreno_gpu->memptrs->rptr;
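+		/* leave one dword free so full is not mistaken for empty: */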
+		freedwords = (rptr + (size - 1) - wptr) % size;
+	} while (freedwords < ndwords);
+}
+
+static const char *iommu_ports[] = {
+		"gfx3d_user", "gfx3d_priv",
+		"gfx3d1_user", "gfx3d1_priv",
+};
+
+static inline bool _rev_match(uint8_t entry, uint8_t id)
+{
+	return (entry == ANY_ID) || (entry == id);
+}
+
+int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
+		struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs,
+		struct adreno_rev rev)
+{
+	int i, ret;
+
+	/* identify gpu: */
+	for (i = 0; i < ARRAY_SIZE(gpulist); i++) {
+		const struct adreno_info *info = &gpulist[i];
+		if (_rev_match(info->rev.core, rev.core) &&
+				_rev_match(info->rev.major, rev.major) &&
+				_rev_match(info->rev.minor, rev.minor) &&
+				_rev_match(info->rev.patchid, rev.patchid)) {
+			gpu->info = info;
+			gpu->revn = info->revn;
+			break;
+		}
+	}
+
+	if (i == ARRAY_SIZE(gpulist)) {
+		dev_err(drm->dev, "Unknown GPU revision: %u.%u.%u.%u\n",
+				rev.core, rev.major, rev.minor, rev.patchid);
+		return -ENXIO;
+	}
+
+	DBG("Found GPU: %s (%u.%u.%u.%u)", gpu->info->name,
+			rev.core, rev.major, rev.minor, rev.patchid);
+
+	gpu->funcs = funcs;
+	gpu->rev = rev;
+
+	ret = request_firmware(&gpu->pm4, gpu->info->pm4fw, drm->dev);
+	if (ret) {
+		dev_err(drm->dev, "failed to load %s PM4 firmware: %d\n",
+				gpu->info->pm4fw, ret);
+		return ret;
+	}
+
+	ret = request_firmware(&gpu->pfp, gpu->info->pfpfw, drm->dev);
+	if (ret) {
+		dev_err(drm->dev, "failed to load %s PFP firmware: %d\n",
+				gpu->info->pfpfw, ret);
+		return ret;
+	}
+
+	ret = msm_gpu_init(drm, pdev, &gpu->base, &funcs->base,
+			gpu->info->name, "kgsl_3d0_reg_memory", "kgsl_3d0_irq",
+			RB_SIZE);
+	if (ret)
+		return ret;
+
+	ret = msm_iommu_attach(drm, gpu->base.iommu,
+			iommu_ports, ARRAY_SIZE(iommu_ports));
+	if (ret)
+		return ret;
+
+	gpu->memptrs_bo = msm_gem_new(drm, sizeof(*gpu->memptrs),
+			MSM_BO_UNCACHED);
+	if (IS_ERR(gpu->memptrs_bo)) {
+		ret = PTR_ERR(gpu->memptrs_bo);
+		gpu->memptrs_bo = NULL;
+		dev_err(drm->dev, "could not allocate memptrs: %d\n", ret);
+		return ret;
+	}
+
+	gpu->memptrs = msm_gem_vaddr_locked(gpu->memptrs_bo);
+	if (!gpu->memptrs) {
+		dev_err(drm->dev, "could not vmap memptrs\n");
+		return -ENOMEM;
+	}
+
+	ret = msm_gem_get_iova_locked(gpu->memptrs_bo, gpu->base.id,
+			&gpu->memptrs_iova);
+	if (ret) {
+		dev_err(drm->dev, "could not map memptrs: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+void adreno_gpu_cleanup(struct adreno_gpu *gpu)
+{
+	if (gpu->memptrs_bo) {
+		if (gpu->memptrs_iova)
+			msm_gem_put_iova(gpu->memptrs_bo, gpu->base.id);
+		drm_gem_object_unreference(gpu->memptrs_bo);
+	}
+	if (gpu->pm4)
+		release_firmware(gpu->pm4);
+	if (gpu->pfp)
+		release_firmware(gpu->pfp);
+	msm_gpu_cleanup(&gpu->base);
+}
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
new file mode 100644
index 0000000..6b49c4f
--- /dev/null
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ADRENO_GPU_H__
+#define __ADRENO_GPU_H__
+
+#include <linux/firmware.h>
+
+#include "msm_gpu.h"
+
+#include "adreno_common.xml.h"
+#include "adreno_pm4.xml.h"
+
+struct adreno_rev {
+	uint8_t  core;
+	uint8_t  major;
+	uint8_t  minor;
+	uint8_t  patchid;
+};
+
+#define ADRENO_REV(core, major, minor, patchid) \
+	((struct adreno_rev){ core, major, minor, patchid })
+
+struct adreno_gpu_funcs {
+	struct msm_gpu_funcs base;
+};
+
+struct adreno_info;
+
+struct adreno_rbmemptrs {
+	volatile uint32_t rptr;
+	volatile uint32_t wptr;
+	volatile uint32_t fence;
+};
+
+struct adreno_gpu {
+	struct msm_gpu base;
+	struct adreno_rev rev;
+	const struct adreno_info *info;
+	uint32_t revn;  /* numeric revision name */
+	const struct adreno_gpu_funcs *funcs;
+
+	uint32_t last_fence;
+
+	/* firmware: */
+	const struct firmware *pm4, *pfp;
+
+	/* ringbuffer rptr/wptr: */
+	// TODO should this be in msm_ringbuffer?  I think it would be
+	// different for z180..
+	struct adreno_rbmemptrs *memptrs;
+	struct drm_gem_object *memptrs_bo;
+	uint32_t memptrs_iova;
+};
+#define to_adreno_gpu(x) container_of(x, struct adreno_gpu, base)
+
+/* platform config data (ie. from DT, or pdata) */
+struct adreno_platform_config {
+	struct adreno_rev rev;
+	uint32_t fast_rate, slow_rate, bus_freq;
+};
+
+#define ADRENO_IDLE_TIMEOUT (20 * 1000)
+
+static inline bool adreno_is_a3xx(struct adreno_gpu *gpu)
+{
+	return (gpu->revn >= 300) && (gpu->revn < 400);
+}
+
+static inline bool adreno_is_a305(struct adreno_gpu *gpu)
+{
+	return gpu->revn == 305;
+}
+
+static inline bool adreno_is_a320(struct adreno_gpu *gpu)
+{
+	return gpu->revn == 320;
+}
+
+static inline bool adreno_is_a330(struct adreno_gpu *gpu)
+{
+	return gpu->revn == 330;
+}
+
+int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value);
+int adreno_hw_init(struct msm_gpu *gpu);
+uint32_t adreno_last_fence(struct msm_gpu *gpu);
+int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
+		struct msm_file_private *ctx);
+void adreno_flush(struct msm_gpu *gpu);
+void adreno_idle(struct msm_gpu *gpu);
+#ifdef CONFIG_DEBUG_FS
+void adreno_show(struct msm_gpu *gpu, struct seq_file *m);
+#endif
+void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords);
+
+int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
+		struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs,
+		struct adreno_rev rev);
+void adreno_gpu_cleanup(struct adreno_gpu *gpu);
+
+
+/* ringbuffer helpers (the parts that are adreno specific) */
+
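+/* type-0 packet: write 'cnt' consecutive registers starting at 'regindx': */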
+static inline void
+OUT_PKT0(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
+{
+	adreno_wait_ring(ring->gpu, cnt+1);
+	OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF));
+}
+
+/* no-op packet: */
+static inline void
+OUT_PKT2(struct msm_ringbuffer *ring)
+{
+	adreno_wait_ring(ring->gpu, 1);
+	OUT_RING(ring, CP_TYPE2_PKT);
+}
+
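+/* type-3 packet: 'opcode' followed by 'cnt' payload dwords: */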
+static inline void
+OUT_PKT3(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
+{
+	adreno_wait_ring(ring->gpu, cnt+1);
+	OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8));
+}
+
+
+#endif /* __ADRENO_GPU_H__ */
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index b5ae0db..864c977 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -16,6 +16,7 @@
  */
 
 #include "msm_drv.h"
+#include "msm_gpu.h"
 
 #include <mach/iommu.h>
 
@@ -135,6 +136,7 @@
 {
 	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_kms *kms = priv->kms;
+	struct msm_gpu *gpu = priv->gpu;
 
 	drm_kms_helper_poll_fini(dev);
 	drm_mode_config_cleanup(dev);
@@ -152,6 +154,12 @@
 		kms->funcs->destroy(kms);
 	}
 
+	if (gpu) {
+		mutex_lock(&dev->struct_mutex);
+		gpu->funcs->pm_suspend(gpu);
+		gpu->funcs->destroy(gpu);
+		mutex_unlock(&dev->struct_mutex);
+	}
 
 	dev->dev_private = NULL;
 
@@ -176,6 +184,7 @@
 	dev->dev_private = priv;
 
 	priv->wq = alloc_ordered_workqueue("msm", 0);
+	init_waitqueue_head(&priv->fence_event);
 
 	INIT_LIST_HEAD(&priv->inactive_list);
 
@@ -240,12 +249,70 @@
 	return ret;
 }
 
+static void load_gpu(struct drm_device *dev)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu;
+
+	if (priv->gpu)
+		return;
+
+	mutex_lock(&dev->struct_mutex);
+	gpu = a3xx_gpu_init(dev);
+	if (IS_ERR(gpu)) {
+		dev_warn(dev->dev, "failed to load a3xx gpu\n");
+		gpu = NULL;
+		/* not fatal */
+	}
+	mutex_unlock(&dev->struct_mutex);
+
+	if (gpu) {
+		int ret;
+		gpu->funcs->pm_resume(gpu);
+		ret = gpu->funcs->hw_init(gpu);
+		if (ret) {
+			dev_err(dev->dev, "gpu hw init failed: %d\n", ret);
+			gpu->funcs->destroy(gpu);
+			gpu = NULL;
+		}
+	}
+
+	priv->gpu = gpu;
+}
+
+static int msm_open(struct drm_device *dev, struct drm_file *file)
+{
+	struct msm_file_private *ctx;
+
+	/* For now, load gpu on open.. to avoid the requirement of having
+	 * firmware in the initrd.
+	 */
+	load_gpu(dev);
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	file->driver_priv = ctx;
+
+	return 0;
+}
+
 static void msm_preclose(struct drm_device *dev, struct drm_file *file)
 {
 	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_file_private *ctx = file->driver_priv;
 	struct msm_kms *kms = priv->kms;
+
 	if (kms)
 		kms->funcs->preclose(kms, file);
+
+	mutex_lock(&dev->struct_mutex);
+	if (ctx == priv->lastctx)
+		priv->lastctx = NULL;
+	mutex_unlock(&dev->struct_mutex);
+
+	kfree(ctx);
 }
 
 static void msm_lastclose(struct drm_device *dev)
@@ -316,11 +383,30 @@
  */
 
 #ifdef CONFIG_DEBUG_FS
+static int msm_gpu_show(struct drm_device *dev, struct seq_file *m)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
+
+	if (gpu) {
+		seq_printf(m, "%s Status:\n", gpu->name);
+		gpu->funcs->show(gpu, m);
+	}
+
+	return 0;
+}
+
 static int msm_gem_show(struct drm_device *dev, struct seq_file *m)
 {
 	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gpu *gpu = priv->gpu;
 
-	seq_printf(m, "All Objects:\n");
+	if (gpu) {
+		seq_printf(m, "Active Objects (%s):\n", gpu->name);
+		msm_gem_describe_objects(&gpu->active_list, m);
+	}
+
+	seq_printf(m, "Inactive Objects:\n");
 	msm_gem_describe_objects(&priv->inactive_list, m);
 
 	return 0;
@@ -375,6 +461,7 @@
 }
 
 static struct drm_info_list msm_debugfs_list[] = {
+		{"gpu", show_locked, 0, msm_gpu_show},
 		{"gem", show_locked, 0, msm_gem_show},
 		{ "mm", show_locked, 0, msm_mm_show },
 		{ "fb", show_locked, 0, msm_fb_show },
@@ -404,6 +491,158 @@
 }
 #endif
 
+/*
+ * Fences:
+ */
+
+int msm_wait_fence_interruptable(struct drm_device *dev, uint32_t fence,
+		struct timespec *timeout)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	unsigned long timeout_jiffies = timespec_to_jiffies(timeout);
+	unsigned long start_jiffies = jiffies;
+	unsigned long remaining_jiffies;
+	int ret;
+
+	if (time_after(start_jiffies, timeout_jiffies))
+		remaining_jiffies = 0;
+	else
+		remaining_jiffies = timeout_jiffies - start_jiffies;
+
+	ret = wait_event_interruptible_timeout(priv->fence_event,
+			priv->completed_fence >= fence,
+			remaining_jiffies);
+	if (ret == 0) {
+		DBG("timeout waiting for fence: %u (completed: %u)",
+				fence, priv->completed_fence);
+		ret = -ETIMEDOUT;
+	} else if (ret != -ERESTARTSYS) {
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/* call under struct_mutex */
+void msm_update_fence(struct drm_device *dev, uint32_t fence)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+
+	if (fence > priv->completed_fence) {
+		priv->completed_fence = fence;
+		wake_up_all(&priv->fence_event);
+	}
+}
+
+/*
+ * DRM ioctls:
+ */
+
+static int msm_ioctl_get_param(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	struct drm_msm_param *args = data;
+	struct msm_gpu *gpu;
+
+	/* for now, we just have the 3d pipe.. eventually this would need
+	 * to be more clever to dispatch to the appropriate gpu module:
+	 */
+	if (args->pipe != MSM_PIPE_3D0)
+		return -EINVAL;
+
+	gpu = priv->gpu;
+
+	if (!gpu)
+		return -ENXIO;
+
+	return gpu->funcs->get_param(gpu, args->param, &args->value);
+}
+
+static int msm_ioctl_gem_new(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_new *args = data;
+	return msm_gem_new_handle(dev, file, args->size,
+			args->flags, &args->handle);
+}
+
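+/* build a kernel struct timespec from the uapi timeout's tv_sec/tv_nsec: */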
+#define TS(t) ((struct timespec){ .tv_sec = (t).tv_sec, .tv_nsec = (t).tv_nsec })
+
+static int msm_ioctl_gem_cpu_prep(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_cpu_prep *args = data;
+	struct drm_gem_object *obj;
+	int ret;
+
+	obj = drm_gem_object_lookup(dev, file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	ret = msm_gem_cpu_prep(obj, args->op, &TS(args->timeout));
+
+	drm_gem_object_unreference_unlocked(obj);
+
+	return ret;
+}
+
+static int msm_ioctl_gem_cpu_fini(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_cpu_fini *args = data;
+	struct drm_gem_object *obj;
+	int ret;
+
+	obj = drm_gem_object_lookup(dev, file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	ret = msm_gem_cpu_fini(obj);
+
+	drm_gem_object_unreference_unlocked(obj);
+
+	return ret;
+}
+
+static int msm_ioctl_gem_info(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_gem_info *args = data;
+	struct drm_gem_object *obj;
+	int ret = 0;
+
+	if (args->pad)
+		return -EINVAL;
+
+	obj = drm_gem_object_lookup(dev, file, args->handle);
+	if (!obj)
+		return -ENOENT;
+
+	args->offset = msm_gem_mmap_offset(obj);
+
+	drm_gem_object_unreference_unlocked(obj);
+
+	return ret;
+}
+
+static int msm_ioctl_wait_fence(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct drm_msm_wait_fence *args = data;
+	return msm_wait_fence_interruptable(dev, args->fence, &TS(args->timeout));
+}
+
+static const struct drm_ioctl_desc msm_ioctls[] = {
+	DRM_IOCTL_DEF_DRV(MSM_GET_PARAM,    msm_ioctl_get_param,    DRM_UNLOCKED|DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_NEW,      msm_ioctl_gem_new,      DRM_UNLOCKED|DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_INFO,     msm_ioctl_gem_info,     DRM_UNLOCKED|DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_CPU_PREP, msm_ioctl_gem_cpu_prep, DRM_UNLOCKED|DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_CPU_FINI, msm_ioctl_gem_cpu_fini, DRM_UNLOCKED|DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(MSM_GEM_SUBMIT,   msm_ioctl_gem_submit,   DRM_UNLOCKED|DRM_AUTH),
+	DRM_IOCTL_DEF_DRV(MSM_WAIT_FENCE,   msm_ioctl_wait_fence,   DRM_UNLOCKED|DRM_AUTH),
+};
+
 static const struct vm_operations_struct vm_ops = {
 	.fault = msm_gem_fault,
 	.open = drm_gem_vm_open,
@@ -428,6 +667,7 @@
 	.driver_features    = DRIVER_HAVE_IRQ | DRIVER_GEM | DRIVER_MODESET,
 	.load               = msm_load,
 	.unload             = msm_unload,
+	.open               = msm_open,
 	.preclose           = msm_preclose,
 	.lastclose          = msm_lastclose,
 	.irq_handler        = msm_irq,
@@ -446,6 +686,8 @@
 	.debugfs_init       = msm_debugfs_init,
 	.debugfs_cleanup    = msm_debugfs_cleanup,
 #endif
+	.ioctls             = msm_ioctls,
+	.num_ioctls         = DRM_MSM_NUM_IOCTLS,
 	.fops               = &fops,
 	.name               = "msm",
 	.desc               = "MSM Snapdragon DRM",
@@ -514,6 +756,7 @@
 {
 	DBG("init");
 	hdmi_register();
+	a3xx_register();
 	return platform_driver_register(&msm_platform_driver);
 }
 
@@ -522,6 +765,7 @@
 	DBG("fini");
 	platform_driver_unregister(&msm_platform_driver);
 	hdmi_unregister();
+	a3xx_unregister();
 }
 
 module_init(msm_drm_register);
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 36f8ba2..34c36b2 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -40,17 +40,34 @@
 #include <drm/drmP.h>
 #include <drm/drm_crtc_helper.h>
 #include <drm/drm_fb_helper.h>
+#include <drm/msm_drm.h>
 
 struct msm_kms;
+struct msm_gpu;
 
-#define NUM_DOMAINS 1    /* one for KMS, then one per gpu core (?) */
+#define NUM_DOMAINS 2    /* one for KMS, then one per gpu core (?) */
+
+struct msm_file_private {
+	/* currently we don't do anything useful with this.. but when
+	 * per-context address spaces are supported we'd keep track of
+	 * the context's page-tables here.
+	 */
+	int dummy;
+};
 
 struct msm_drm_private {
 
 	struct msm_kms *kms;
 
+	/* when we have more than one 'msm_gpu' these need to be an array: */
+	struct msm_gpu *gpu;
+	struct msm_file_private *lastctx;
+
 	struct drm_fb_helper *fbdev;
 
+	uint32_t next_fence, completed_fence;
+	wait_queue_head_t fence_event;
+
 	/* list of GEM objects: */
 	struct list_head inactive_list;
 
@@ -108,6 +125,13 @@
 int msm_iommu_attach(struct drm_device *dev, struct iommu_domain *iommu,
 		const char **names, int cnt);
 
+int msm_wait_fence_interruptable(struct drm_device *dev, uint32_t fence,
+		struct timespec *timeout);
+void msm_update_fence(struct drm_device *dev, uint32_t fence);
+
+int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
+		struct drm_file *file);
+
 int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma);
 int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj);
@@ -125,6 +149,12 @@
 void *msm_gem_vaddr(struct drm_gem_object *obj);
 int msm_gem_queue_inactive_work(struct drm_gem_object *obj,
 		struct work_struct *work);
+void msm_gem_move_to_active(struct drm_gem_object *obj,
+		struct msm_gpu *gpu, uint32_t fence);
+void msm_gem_move_to_inactive(struct drm_gem_object *obj);
+int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op,
+		struct timespec *timeout);
+int msm_gem_cpu_fini(struct drm_gem_object *obj);
 void msm_gem_free_object(struct drm_gem_object *obj);
 int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file,
 		uint32_t size, uint32_t flags, uint32_t *handle);
@@ -168,20 +198,14 @@
 
 /* for the generated headers: */
 #define INVALID_IDX(idx) ({BUG(); 0;})
+#define fui(x)                ({BUG(); 0;})
+#define util_float_to_half(x) ({BUG(); 0;})
+
 
 #define FIELD(val, name) (((val) & name ## __MASK) >> name ## __SHIFT)
 
 /* for conditionally setting boolean flag(s): */
 #define COND(bool, val) ((bool) ? (val) : 0)
 
-/* just put these here until we start adding driver private ioctls: */
-// TODO might shuffle these around.. just need something for now..
-#define MSM_BO_CACHE_MASK	0x0000000f
-#define MSM_BO_SCANOUT		0x00010000	/* scanout capable */
-
-#define MSM_BO_CACHED		0x00000001	/* default */
-#define MSM_BO_WC		0x0000002
-#define MSM_BO_UNCACHED	0x00000004
-
 
 #endif /* __MSM_DRV_H__ */
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index a52e6cc..6b5a6c8 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -20,6 +20,7 @@
 
 #include "msm_drv.h"
 #include "msm_gem.h"
+#include "msm_gpu.h"
 
 
 /* called with dev->struct_mutex held */
@@ -375,10 +376,74 @@
 {
 	struct drm_device *dev = obj->dev;
 	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+	int ret = 0;
 
-	/* just a place-holder until we have gpu.. */
-	queue_work(priv->wq, work);
+	mutex_lock(&dev->struct_mutex);
+	if (!list_empty(&work->entry)) {
+		ret = -EINVAL;
+	} else if (is_active(msm_obj)) {
+		list_add_tail(&work->entry, &msm_obj->inactive_work);
+	} else {
+		queue_work(priv->wq, work);
+	}
+	mutex_unlock(&dev->struct_mutex);
 
+	return ret;
+}
+
+void msm_gem_move_to_active(struct drm_gem_object *obj,
+		struct msm_gpu *gpu, uint32_t fence)
+{
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+	msm_obj->gpu = gpu;
+	msm_obj->fence = fence;
+	list_del_init(&msm_obj->mm_list);
+	list_add_tail(&msm_obj->mm_list, &gpu->active_list);
+}
+
+void msm_gem_move_to_inactive(struct drm_gem_object *obj)
+{
+	struct drm_device *dev = obj->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+
+	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	msm_obj->gpu = NULL;
+	msm_obj->fence = 0;
+	list_del_init(&msm_obj->mm_list);
+	list_add_tail(&msm_obj->mm_list, &priv->inactive_list);
+
+	while (!list_empty(&msm_obj->inactive_work)) {
+		struct work_struct *work;
+
+		work = list_first_entry(&msm_obj->inactive_work,
+				struct work_struct, entry);
+
+		list_del_init(&work->entry);
+		queue_work(priv->wq, work);
+	}
+}
+
+int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op,
+		struct timespec *timeout)
+{
+	struct drm_device *dev = obj->dev;
+	struct msm_gem_object *msm_obj = to_msm_bo(obj);
+	int ret = 0;
+
+	if (is_active(msm_obj) && !(op & MSM_PREP_NOSYNC))
+		ret = msm_wait_fence_interruptable(dev, msm_obj->fence, timeout);
+
+	/* TODO cache maintenance */
+
+	return ret;
+}
+
+int msm_gem_cpu_fini(struct drm_gem_object *obj)
+{
+	/* TODO cache maintenance */
 	return 0;
 }
 
@@ -390,8 +455,9 @@
 	uint64_t off = drm_vma_node_start(&obj->vma_node);
 
 	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
-	seq_printf(m, "%08x: %2d (%2d) %08llx %p %d\n",
-			msm_obj->flags, obj->name, obj->refcount.refcount.counter,
+	seq_printf(m, "%08x: %c(%d) %2d (%2d) %08llx %p %d\n",
+			msm_obj->flags, is_active(msm_obj) ? 'A' : 'I',
+			msm_obj->fence, obj->name, obj->refcount.refcount.counter,
 			off, msm_obj->vaddr, obj->size);
 }
 
@@ -421,6 +487,9 @@
 
 	WARN_ON(!mutex_is_locked(&dev->struct_mutex));
 
+	/* object should not be on active list: */
+	WARN_ON(is_active(msm_obj));
+
 	list_del(&msm_obj->mm_list);
 
 	for (id = 0; id < ARRAY_SIZE(msm_obj->domain); id++) {
@@ -439,6 +508,9 @@
 
 	put_pages(obj);
 
+	if (msm_obj->resv == &msm_obj->_resv)
+		reservation_object_fini(msm_obj->resv);
+
 	drm_gem_object_release(obj);
 
 	kfree(msm_obj);
@@ -508,7 +580,11 @@
 
 	msm_obj->flags = flags;
 
+	msm_obj->resv = &msm_obj->_resv;
+	reservation_object_init(msm_obj->resv);
 
+	INIT_LIST_HEAD(&msm_obj->submit_entry);
+	INIT_LIST_HEAD(&msm_obj->inactive_work);
 	list_add_tail(&msm_obj->mm_list, &priv->inactive_list);
 
 	return obj;
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index fcafd19..d746f13 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -18,6 +18,7 @@
 #ifndef __MSM_GEM_H__
 #define __MSM_GEM_H__
 
+#include <linux/reservation.h>
 #include "msm_drv.h"
 
 struct msm_gem_object {
@@ -25,7 +26,27 @@
 
 	uint32_t flags;
 
+	/* An object is either:
+	 *  inactive - on priv->inactive_list
+	 *  active   - on one of the gpu's active_list..  well, at
+	 *     least for now we don't have (I don't think) hw sync
+	 *     between the 2d and 3d cores on devices which have both,
+	 *     meaning we need to block on submit if a bo is already
+	 *     on the other ring
+	 */
 	struct list_head mm_list;
+	struct msm_gpu *gpu;     /* non-null if active */
+	uint32_t fence;
+
+	/* While transiently in the submit ioctl, objects associated
+	 * with the submit are on submit->bo_list.. this only lasts for
+	 * the duration of the ioctl, so one bo can never be on multiple
+	 * submit lists.
+	 */
+	struct list_head submit_entry;
+
+	/* work deferred until bo is inactive: */
+	struct list_head inactive_work;
 
 	struct page **pages;
 	struct sg_table *sgt;
@@ -35,7 +56,44 @@
 		// XXX
 		uint32_t iova;
 	} domain[NUM_DOMAINS];
+
+	/* normally (resv == &_resv) except for imported bo's */
+	struct reservation_object *resv;
+	struct reservation_object _resv;
 };
 #define to_msm_bo(x) container_of(x, struct msm_gem_object, base)
 
+static inline bool is_active(struct msm_gem_object *msm_obj)
+{
+	return msm_obj->gpu != NULL;
+}
+
+#define MAX_CMDS 4
+
+/* Created per submit-ioctl, to track bo's and cmdstream bufs, etc,
+ * associated with the cmdstream submission for synchronization (and
+ * make it easier to unwind when things go wrong, etc).  This only
+ * lasts for the duration of the submit-ioctl.
+ */
+struct msm_gem_submit {
+	struct drm_device *dev;
+	struct msm_gpu *gpu;
+	struct list_head bo_list;
+	struct ww_acquire_ctx ticket;
+	uint32_t fence;
+	bool valid;
+	unsigned int nr_cmds;
+	unsigned int nr_bos;
+	struct {
+		uint32_t type;
+		uint32_t size;  /* in dwords */
+		uint32_t iova;
+	} cmd[MAX_CMDS];
+	struct {
+		uint32_t flags;
+		struct msm_gem_object *obj;
+		uint32_t iova;
+	} bos[0];
+};
+
 #endif /* __MSM_GEM_H__ */
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
new file mode 100644
index 0000000..3e1ef3a
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "msm_drv.h"
+#include "msm_gpu.h"
+#include "msm_gem.h"
+
+/*
+ * Cmdstream submission:
+ */
+
+#define BO_INVALID_FLAGS ~(MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE)
+/* make sure these don't conflict w/ MSM_SUBMIT_BO_x */
+#define BO_VALID    0x8000
+#define BO_LOCKED   0x4000
+#define BO_PINNED   0x2000
+
+static inline void __user *to_user_ptr(u64 address)
+{
+	return (void __user *)(uintptr_t)address;
+}
+
+static struct msm_gem_submit *submit_create(struct drm_device *dev,
+		struct msm_gpu *gpu, int nr)
+{
+	struct msm_gem_submit *submit;
+	int sz = sizeof(*submit) + (nr * sizeof(submit->bos[0]));
+
+	submit = kmalloc(sz, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
+	if (submit) {
+		submit->dev = dev;
+		submit->gpu = gpu;
+
+		/* initially, until copy_from_user() and the bo lookup succeed: */
+		submit->nr_bos = 0;
+		submit->nr_cmds = 0;
+
+		INIT_LIST_HEAD(&submit->bo_list);
+		ww_acquire_init(&submit->ticket, &reservation_ww_class);
+	}
+
+	return submit;
+}
+
+static int submit_lookup_objects(struct msm_gem_submit *submit,
+		struct drm_msm_gem_submit *args, struct drm_file *file)
+{
+	unsigned i;
+	int ret = 0;
+
+	spin_lock(&file->table_lock);
+
+	for (i = 0; i < args->nr_bos; i++) {
+		struct drm_msm_gem_submit_bo submit_bo;
+		struct drm_gem_object *obj;
+		struct msm_gem_object *msm_obj;
+		void __user *userptr =
+			to_user_ptr(args->bos + (i * sizeof(submit_bo)));
+
+		ret = copy_from_user(&submit_bo, userptr, sizeof(submit_bo));
+		if (ret) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+
+		if (submit_bo.flags & BO_INVALID_FLAGS) {
+			DBG("invalid flags: %x", submit_bo.flags);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		submit->bos[i].flags = submit_bo.flags;
+		/* in validate_objects() we figure out if this is true: */
+		submit->bos[i].iova  = submit_bo.presumed;
+
+		/* normally we'd use drm_gem_object_lookup(), but to do the
+		 * bulk lookup all under a single table_lock we just hit
+		 * object_idr directly:
+		 */
+		obj = idr_find(&file->object_idr, submit_bo.handle);
+		if (!obj) {
+			DBG("invalid handle %u at index %u", submit_bo.handle, i);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		msm_obj = to_msm_bo(obj);
+
+		if (!list_empty(&msm_obj->submit_entry)) {
+			DBG("handle %u at index %u already on submit list",
+					submit_bo.handle, i);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		drm_gem_object_reference(obj);
+
+		submit->bos[i].obj = msm_obj;
+
+		list_add_tail(&msm_obj->submit_entry, &submit->bo_list);
+	}
+
+out_unlock:
+	submit->nr_bos = i;
+	spin_unlock(&file->table_lock);
+
+	return ret;
+}
+
+static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i)
+{
+	struct msm_gem_object *msm_obj = submit->bos[i].obj;
+
+	if (submit->bos[i].flags & BO_PINNED)
+		msm_gem_put_iova(&msm_obj->base, submit->gpu->id);
+
+	if (submit->bos[i].flags & BO_LOCKED)
+		ww_mutex_unlock(&msm_obj->resv->lock);
+
+	if (!(submit->bos[i].flags & BO_VALID))
+		submit->bos[i].iova = 0;
+
+	submit->bos[i].flags &= ~(BO_LOCKED | BO_PINNED);
+}
+
+/* This is where we make sure all the bo's are reserved and pin'd: */
+static int submit_validate_objects(struct msm_gem_submit *submit)
+{
+	int contended, slow_locked = -1, i, ret = 0;
+
+retry:
+	submit->valid = true;
+
+	for (i = 0; i < submit->nr_bos; i++) {
+		struct msm_gem_object *msm_obj = submit->bos[i].obj;
+		uint32_t iova;
+
+		if (slow_locked == i)
+			slow_locked = -1;
+
+		contended = i;
+
+		if (!(submit->bos[i].flags & BO_LOCKED)) {
+			ret = ww_mutex_lock_interruptible(&msm_obj->resv->lock,
+					&submit->ticket);
+			if (ret)
+				goto fail;
+			submit->bos[i].flags |= BO_LOCKED;
+		}
+
+
+		/* if locking succeeded, pin bo: */
+		ret = msm_gem_get_iova(&msm_obj->base,
+				submit->gpu->id, &iova);
+
+		/* this would break the logic in the fail path.. there is no
+		 * reason for this to happen, but just to be on the safe side
+		 * let's notice if this starts happening in the future:
+		 */
+		WARN_ON(ret == -EDEADLK);
+
+		if (ret)
+			goto fail;
+
+		submit->bos[i].flags |= BO_PINNED;
+
+		if (iova == submit->bos[i].iova) {
+			submit->bos[i].flags |= BO_VALID;
+		} else {
+			submit->bos[i].iova = iova;
+			submit->bos[i].flags &= ~BO_VALID;
+			submit->valid = false;
+		}
+	}
+
+	ww_acquire_done(&submit->ticket);
+
+	return 0;
+
+fail:
+	for (; i >= 0; i--)
+		submit_unlock_unpin_bo(submit, i);
+
+	if (slow_locked > 0)
+		submit_unlock_unpin_bo(submit, slow_locked);
+
+	if (ret == -EDEADLK) {
+		struct msm_gem_object *msm_obj = submit->bos[contended].obj;
+		/* we lost out in a seqno race, lock and retry.. */
+		ret = ww_mutex_lock_slow_interruptible(&msm_obj->resv->lock,
+				&submit->ticket);
+		if (!ret) {
+			submit->bos[contended].flags |= BO_LOCKED;
+			slow_locked = contended;
+			goto retry;
+		}
+	}
+
+	return ret;
+}
+
+static int submit_bo(struct msm_gem_submit *submit, uint32_t idx,
+		struct msm_gem_object **obj, uint32_t *iova, bool *valid)
+{
+	if (idx >= submit->nr_bos) {
+		DBG("invalid buffer index: %u (out of %u)", idx, submit->nr_bos);
+		return -EINVAL;
+	}
+
+	if (obj)
+		*obj = submit->bos[idx].obj;
+	if (iova)
+		*iova = submit->bos[idx].iova;
+	if (valid)
+		*valid = !!(submit->bos[idx].flags & BO_VALID);
+
+	return 0;
+}
+
+/* process the reloc's and patch up the cmdstream as needed: */
+static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *obj,
+		uint32_t offset, uint32_t nr_relocs, uint64_t relocs)
+{
+	uint32_t i, last_offset = 0;
+	uint32_t *ptr;
+	int ret;
+
+	if (offset % 4) {
+		DBG("non-aligned cmdstream buffer: %u", offset);
+		return -EINVAL;
+	}
+
+	/* For now, just map the entire thing.  Eventually we probably
+	 * want to do it page-by-page, with kmap() if not vmap()d..
+	 */
+	ptr = msm_gem_vaddr(&obj->base);
+
+	if (IS_ERR(ptr)) {
+		ret = PTR_ERR(ptr);
+		DBG("failed to map: %d", ret);
+		return ret;
+	}
+
+	for (i = 0; i < nr_relocs; i++) {
+		struct drm_msm_gem_submit_reloc submit_reloc;
+		void __user *userptr =
+			to_user_ptr(relocs + (i * sizeof(submit_reloc)));
+		uint32_t iova, off;
+		bool valid;
+
+		ret = copy_from_user(&submit_reloc, userptr, sizeof(submit_reloc));
+		if (ret)
+			return -EFAULT;
+
+		if (submit_reloc.submit_offset % 4) {
+			DBG("non-aligned reloc offset: %u",
+					submit_reloc.submit_offset);
+			return -EINVAL;
+		}
+
+		/* offset in dwords: */
+		off = submit_reloc.submit_offset / 4;
+
+		if ((off >= (obj->base.size / 4)) ||
+				(off < last_offset)) {
+			DBG("invalid offset %u at reloc %u", off, i);
+			return -EINVAL;
+		}
+
+		ret = submit_bo(submit, submit_reloc.reloc_idx, NULL, &iova, &valid);
+		if (ret)
+			return ret;
+
+		if (valid)
+			continue;
+
+		iova += submit_reloc.reloc_offset;
+
+		if (submit_reloc.shift < 0)
+			iova >>= -submit_reloc.shift;
+		else
+			iova <<= submit_reloc.shift;
+
+		ptr[off] = iova | submit_reloc.or;
+
+		last_offset = off;
+	}
+
+	return 0;
+}
+
+static void submit_cleanup(struct msm_gem_submit *submit, bool fail)
+{
+	unsigned i;
+
+	mutex_lock(&submit->dev->struct_mutex);
+	for (i = 0; i < submit->nr_bos; i++) {
+		struct msm_gem_object *msm_obj = submit->bos[i].obj;
+		submit_unlock_unpin_bo(submit, i);
+		list_del_init(&msm_obj->submit_entry);
+		drm_gem_object_unreference(&msm_obj->base);
+	}
+	mutex_unlock(&submit->dev->struct_mutex);
+
+	ww_acquire_fini(&submit->ticket);
+	kfree(submit);
+}
+
+int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
+		struct drm_file *file)
+{
+	struct msm_drm_private *priv = dev->dev_private;
+	struct drm_msm_gem_submit *args = data;
+	struct msm_file_private *ctx = file->driver_priv;
+	struct msm_gem_submit *submit;
+	struct msm_gpu *gpu;
+	unsigned i;
+	int ret;
+
+	/* for now, we just have 3d pipe.. eventually this would need to
+	 * be more clever to dispatch to appropriate gpu module:
+	 */
+	if (args->pipe != MSM_PIPE_3D0)
+		return -EINVAL;
+
+	gpu = priv->gpu;
+
+	if (args->nr_cmds > MAX_CMDS)
+		return -EINVAL;
+
+	submit = submit_create(dev, gpu, args->nr_bos);
+	if (!submit) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = submit_lookup_objects(submit, args, file);
+	if (ret)
+		goto out;
+
+	ret = submit_validate_objects(submit);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < args->nr_cmds; i++) {
+		struct drm_msm_gem_submit_cmd submit_cmd;
+		void __user *userptr =
+			to_user_ptr(args->cmds + (i * sizeof(submit_cmd)));
+		struct msm_gem_object *msm_obj;
+		uint32_t iova;
+
+		ret = copy_from_user(&submit_cmd, userptr, sizeof(submit_cmd));
+		if (ret) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret = submit_bo(submit, submit_cmd.submit_idx,
+				&msm_obj, &iova, NULL);
+		if (ret)
+			goto out;
+
+		if (submit_cmd.size % 4) {
+			DBG("non-aligned cmdstream buffer size: %u",
+					submit_cmd.size);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if ((submit_cmd.submit_offset + submit_cmd.size) >
+				msm_obj->base.size) {
+			DBG("invalid cmdstream size: %u", submit_cmd.size);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		submit->cmd[i].type = submit_cmd.type;
+		submit->cmd[i].size = submit_cmd.size / 4;
+		submit->cmd[i].iova = iova + submit_cmd.submit_offset;
+
+		if (submit->valid)
+			continue;
+
+		ret = submit_reloc(submit, msm_obj, submit_cmd.submit_offset,
+				submit_cmd.nr_relocs, submit_cmd.relocs);
+		if (ret)
+			goto out;
+	}
+
+	submit->nr_cmds = i;
+
+	ret = msm_gpu_submit(gpu, submit, ctx);
+
+	args->fence = submit->fence;
+
+out:
+	if (submit)
+		submit_cleanup(submit, !!ret);
+	return ret;
+}
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
new file mode 100644
index 0000000..7c6541e
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "msm_gpu.h"
+#include "msm_gem.h"
+
+
+/*
+ * Power Management:
+ */
+
+#ifdef CONFIG_MSM_BUS_SCALING
+#include <mach/board.h>
+#include <mach/kgsl.h>
+static void bs_init(struct msm_gpu *gpu, struct platform_device *pdev)
+{
+	struct drm_device *dev = gpu->dev;
+	struct kgsl_device_platform_data *pdata;
+
+	if (!pdev) {
+		dev_err(dev->dev, "could not find gpu pdata\n");
+		return;
+	}
+
+	pdata = pdev->dev.platform_data;
+
+	if (pdata->bus_scale_table) {
+		gpu->bsc = msm_bus_scale_register_client(pdata->bus_scale_table);
+		DBG("bus scale client: %08x", gpu->bsc);
+	}
+}
+
+static void bs_fini(struct msm_gpu *gpu)
+{
+	if (gpu->bsc) {
+		msm_bus_scale_unregister_client(gpu->bsc);
+		gpu->bsc = 0;
+	}
+}
+
+static void bs_set(struct msm_gpu *gpu, int idx)
+{
+	if (gpu->bsc) {
+		DBG("set bus scaling: %d", idx);
+		msm_bus_scale_client_update_request(gpu->bsc, idx);
+	}
+}
+#else
+static void bs_init(struct msm_gpu *gpu, struct platform_device *pdev) {}
+static void bs_fini(struct msm_gpu *gpu) {}
+static void bs_set(struct msm_gpu *gpu, int idx) {}
+#endif
+
+static int enable_pwrrail(struct msm_gpu *gpu)
+{
+	struct drm_device *dev = gpu->dev;
+	int ret = 0;
+
+	if (gpu->gpu_reg) {
+		ret = regulator_enable(gpu->gpu_reg);
+		if (ret) {
+			dev_err(dev->dev, "failed to enable 'gpu_reg': %d\n", ret);
+			return ret;
+		}
+	}
+
+	if (gpu->gpu_cx) {
+		ret = regulator_enable(gpu->gpu_cx);
+		if (ret) {
+			dev_err(dev->dev, "failed to enable 'gpu_cx': %d\n", ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int disable_pwrrail(struct msm_gpu *gpu)
+{
+	if (gpu->gpu_cx)
+		regulator_disable(gpu->gpu_cx);
+	if (gpu->gpu_reg)
+		regulator_disable(gpu->gpu_reg);
+	return 0;
+}
+
+static int enable_clk(struct msm_gpu *gpu)
+{
+	struct clk *rate_clk = NULL;
+	int i;
+
+	/* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */
+	for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) {
+		if (gpu->grp_clks[i]) {
+			clk_prepare(gpu->grp_clks[i]);
+			rate_clk = gpu->grp_clks[i];
+		}
+	}
+
+	if (rate_clk && gpu->fast_rate)
+		clk_set_rate(rate_clk, gpu->fast_rate);
+
+	for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--)
+		if (gpu->grp_clks[i])
+			clk_enable(gpu->grp_clks[i]);
+
+	return 0;
+}
+
+static int disable_clk(struct msm_gpu *gpu)
+{
+	struct clk *rate_clk = NULL;
+	int i;
+
+	/* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */
+	for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) {
+		if (gpu->grp_clks[i]) {
+			clk_disable(gpu->grp_clks[i]);
+			rate_clk = gpu->grp_clks[i];
+		}
+	}
+
+	if (rate_clk && gpu->slow_rate)
+		clk_set_rate(rate_clk, gpu->slow_rate);
+
+	for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--)
+		if (gpu->grp_clks[i])
+			clk_unprepare(gpu->grp_clks[i]);
+
+	return 0;
+}
+
+static int enable_axi(struct msm_gpu *gpu)
+{
+	if (gpu->ebi1_clk)
+		clk_prepare_enable(gpu->ebi1_clk);
+	if (gpu->bus_freq)
+		bs_set(gpu, gpu->bus_freq);
+	return 0;
+}
+
+static int disable_axi(struct msm_gpu *gpu)
+{
+	if (gpu->ebi1_clk)
+		clk_disable_unprepare(gpu->ebi1_clk);
+	if (gpu->bus_freq)
+		bs_set(gpu, 0);
+	return 0;
+}
+
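+/* Power-up order below is rails, then core clocks, then the AXI/bus path;
+ * msm_gpu_pm_suspend() tears the same pieces down in reverse order.
+ */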
+int msm_gpu_pm_resume(struct msm_gpu *gpu)
+{
+	int ret;
+
+	DBG("%s", gpu->name);
+
+	ret = enable_pwrrail(gpu);
+	if (ret)
+		return ret;
+
+	ret = enable_clk(gpu);
+	if (ret)
+		return ret;
+
+	ret = enable_axi(gpu);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+int msm_gpu_pm_suspend(struct msm_gpu *gpu)
+{
+	int ret;
+
+	DBG("%s", gpu->name);
+
+	ret = disable_axi(gpu);
+	if (ret)
+		return ret;
+
+	ret = disable_clk(gpu);
+	if (ret)
+		return ret;
+
+	ret = disable_pwrrail(gpu);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * Cmdstream submission/retirement:
+ */
+
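+/* Each submit is tagged with a monotonically increasing fence number
+ * (assigned in msm_gpu_submit()).  When the irq handler notices the GPU
+ * has passed a fence it calls msm_gpu_retire(), and retire_worker() then
+ * moves every active bo whose fence has been reached back to the inactive
+ * list, dropping the iova and bo references taken at submit time.
+ */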
+static void retire_worker(struct work_struct *work)
+{
+	struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work);
+	struct drm_device *dev = gpu->dev;
+	uint32_t fence = gpu->funcs->last_fence(gpu);
+
+	mutex_lock(&dev->struct_mutex);
+
+	while (!list_empty(&gpu->active_list)) {
+		struct msm_gem_object *obj;
+
+		obj = list_first_entry(&gpu->active_list,
+				struct msm_gem_object, mm_list);
+
+		if (obj->fence <= fence) {
+			/* move to inactive: */
+			msm_gem_move_to_inactive(&obj->base);
+			msm_gem_put_iova(&obj->base, gpu->id);
+			drm_gem_object_unreference(&obj->base);
+		} else {
+			break;
+		}
+	}
+
+	msm_update_fence(gpu->dev, fence);
+
+	mutex_unlock(&dev->struct_mutex);
+}
+
+/* call from irq handler to schedule work to retire bo's */
+void msm_gpu_retire(struct msm_gpu *gpu)
+{
+	struct msm_drm_private *priv = gpu->dev->dev_private;
+	queue_work(priv->wq, &gpu->retire_work);
+}
+
+/* add bo's to gpu's ring, and kick gpu: */
+int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
+		struct msm_file_private *ctx)
+{
+	struct drm_device *dev = gpu->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	int i, ret;
+
+	mutex_lock(&dev->struct_mutex);
+
+	submit->fence = ++priv->next_fence;
+
+	ret = gpu->funcs->submit(gpu, submit, ctx);
+	priv->lastctx = ctx;
+
+	for (i = 0; i < submit->nr_bos; i++) {
+		struct msm_gem_object *msm_obj = submit->bos[i].obj;
+
+		/* can't happen yet.. but when we add 2d support we'll have
+		 * to deal w/ cross-ring synchronization:
+		 */
+		WARN_ON(is_active(msm_obj) && (msm_obj->gpu != gpu));
+
+		if (!is_active(msm_obj)) {
+			uint32_t iova;
+
+			/* ring takes a reference to the bo and iova: */
+			drm_gem_object_reference(&msm_obj->base);
+			msm_gem_get_iova_locked(&msm_obj->base,
+					submit->gpu->id, &iova);
+		}
+
+		msm_gem_move_to_active(&msm_obj->base, gpu, submit->fence);
+	}
+	mutex_unlock(&dev->struct_mutex);
+
+	return ret;
+}
+
+/*
+ * Init/Cleanup:
+ */
+
+static irqreturn_t irq_handler(int irq, void *data)
+{
+	struct msm_gpu *gpu = data;
+	return gpu->funcs->irq(gpu);
+}
+
+static const char *clk_names[] = {
+		"src_clk", "core_clk", "iface_clk", "mem_clk", "mem_iface_clk",
+};
+
+int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
+		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
+		const char *name, const char *ioname, const char *irqname, int ringsz)
+{
+	int i, ret;
+
+	gpu->dev = drm;
+	gpu->funcs = funcs;
+	gpu->name = name;
+
+	INIT_LIST_HEAD(&gpu->active_list);
+	INIT_WORK(&gpu->retire_work, retire_worker);
+
+	BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));
+
+	/* Map registers: */
+	gpu->mmio = msm_ioremap(pdev, ioname, name);
+	if (IS_ERR(gpu->mmio)) {
+		ret = PTR_ERR(gpu->mmio);
+		goto fail;
+	}
+
+	/* Get Interrupt: */
+	gpu->irq = platform_get_irq_byname(pdev, irqname);
+	if (gpu->irq < 0) {
+		ret = gpu->irq;
+		dev_err(drm->dev, "failed to get irq: %d\n", ret);
+		goto fail;
+	}
+
+	ret = devm_request_irq(&pdev->dev, gpu->irq, irq_handler,
+			IRQF_TRIGGER_HIGH, gpu->name, gpu);
+	if (ret) {
+		dev_err(drm->dev, "failed to request IRQ%u: %d\n", gpu->irq, ret);
+		goto fail;
+	}
+
+	/* Acquire clocks: */
+	for (i = 0; i < ARRAY_SIZE(clk_names); i++) {
+		gpu->grp_clks[i] = devm_clk_get(&pdev->dev, clk_names[i]);
+		DBG("grp_clks[%s]: %p", clk_names[i], gpu->grp_clks[i]);
+		if (IS_ERR(gpu->grp_clks[i]))
+			gpu->grp_clks[i] = NULL;
+	}
+
+	gpu->ebi1_clk = devm_clk_get(&pdev->dev, "bus_clk");
+	DBG("ebi1_clk: %p", gpu->ebi1_clk);
+	if (IS_ERR(gpu->ebi1_clk))
+		gpu->ebi1_clk = NULL;
+
+	/* Acquire regulators: */
+	gpu->gpu_reg = devm_regulator_get(&pdev->dev, "vdd");
+	DBG("gpu_reg: %p", gpu->gpu_reg);
+	if (IS_ERR(gpu->gpu_reg))
+		gpu->gpu_reg = NULL;
+
+	gpu->gpu_cx = devm_regulator_get(&pdev->dev, "vddcx");
+	DBG("gpu_cx: %p", gpu->gpu_cx);
+	if (IS_ERR(gpu->gpu_cx))
+		gpu->gpu_cx = NULL;
+
+	/* Setup IOMMU.. eventually we will (I think) do this once per context
+	 * and have separate page tables per context.  For now, to keep things
+	 * simple and to get something working, just use a single address space:
+	 */
+	gpu->iommu = iommu_domain_alloc(&platform_bus_type);
+	if (!gpu->iommu) {
+		dev_err(drm->dev, "failed to allocate IOMMU\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	gpu->id = msm_register_iommu(drm, gpu->iommu);
+
+	/* Create ringbuffer: */
+	gpu->rb = msm_ringbuffer_new(gpu, ringsz);
+	if (IS_ERR(gpu->rb)) {
+		ret = PTR_ERR(gpu->rb);
+		gpu->rb = NULL;
+		dev_err(drm->dev, "could not create ringbuffer: %d\n", ret);
+		goto fail;
+	}
+
+	ret = msm_gem_get_iova_locked(gpu->rb->bo, gpu->id, &gpu->rb_iova);
+	if (ret) {
+		gpu->rb_iova = 0;
+		dev_err(drm->dev, "could not map ringbuffer: %d\n", ret);
+		goto fail;
+	}
+
+	bs_init(gpu, pdev);
+
+	return 0;
+
+fail:
+	return ret;
+}
+
+void msm_gpu_cleanup(struct msm_gpu *gpu)
+{
+	DBG("%s", gpu->name);
+
+	WARN_ON(!list_empty(&gpu->active_list));
+
+	bs_fini(gpu);
+
+	if (gpu->rb) {
+		if (gpu->rb_iova)
+			msm_gem_put_iova(gpu->rb->bo, gpu->id);
+		msm_ringbuffer_destroy(gpu->rb);
+	}
+
+	if (gpu->iommu)
+		iommu_domain_free(gpu->iommu);
+}
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
new file mode 100644
index 0000000..8d2cd6c
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MSM_GPU_H__
+#define __MSM_GPU_H__
+
+#include <linux/clk.h>
+#include <linux/regulator/consumer.h>
+
+#include "msm_drv.h"
+#include "msm_ringbuffer.h"
+
+struct msm_gem_submit;
+
+/* So far, with hardware that I've seen to date, we can have:
+ *  + zero, one, or two z180 2d cores
+ *  + a3xx or a2xx 3d core, which share a common CP (the firmware
+ *    for the CP seems to implement some different PM4 packet types
+ *    but the basics of cmdstream submission are the same)
+ *
+ * Which means that the eventual complete "class" hierarchy, once
+ * support for all past and present hw is in place, becomes:
+ *  + msm_gpu
+ *    + adreno_gpu
+ *      + a3xx_gpu
+ *      + a2xx_gpu
+ *    + z180_gpu
+ */
+struct msm_gpu_funcs {
+	int (*get_param)(struct msm_gpu *gpu, uint32_t param, uint64_t *value);
+	int (*hw_init)(struct msm_gpu *gpu);
+	int (*pm_suspend)(struct msm_gpu *gpu);
+	int (*pm_resume)(struct msm_gpu *gpu);
+	int (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit,
+			struct msm_file_private *ctx);
+	void (*flush)(struct msm_gpu *gpu);
+	void (*idle)(struct msm_gpu *gpu);
+	irqreturn_t (*irq)(struct msm_gpu *gpu);
+	uint32_t (*last_fence)(struct msm_gpu *gpu);
+	void (*destroy)(struct msm_gpu *gpu);
+#ifdef CONFIG_DEBUG_FS
+	/* show GPU status in debugfs: */
+	void (*show)(struct msm_gpu *gpu, struct seq_file *m);
+#endif
+};
+
+struct msm_gpu {
+	const char *name;
+	struct drm_device *dev;
+	const struct msm_gpu_funcs *funcs;
+
+	struct msm_ringbuffer *rb;
+	uint32_t rb_iova;
+
+	/* list of GEM active objects: */
+	struct list_head active_list;
+
+	/* worker for handling active-list retiring: */
+	struct work_struct retire_work;
+
+	void __iomem *mmio;
+	int irq;
+
+	struct iommu_domain *iommu;
+	int id;
+
+	/* Power Control: */
+	struct regulator *gpu_reg, *gpu_cx;
+	struct clk *ebi1_clk, *grp_clks[5];
+	uint32_t fast_rate, slow_rate, bus_freq;
+	uint32_t bsc;
+};
+
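+/* Register offsets are dword offsets; the '<< 2' converts to a byte offset
+ * into the mmio region:
+ */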
+static inline void gpu_write(struct msm_gpu *gpu, u32 reg, u32 data)
+{
+	msm_writel(data, gpu->mmio + (reg << 2));
+}
+
+static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
+{
+	return msm_readl(gpu->mmio + (reg << 2));
+}
+
+int msm_gpu_pm_suspend(struct msm_gpu *gpu);
+int msm_gpu_pm_resume(struct msm_gpu *gpu);
+
+void msm_gpu_retire(struct msm_gpu *gpu);
+int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
+		struct msm_file_private *ctx);
+
+int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
+		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
+		const char *name, const char *ioname, const char *irqname, int ringsz);
+void msm_gpu_cleanup(struct msm_gpu *gpu);
+
+struct msm_gpu *a3xx_gpu_init(struct drm_device *dev);
+void __init a3xx_register(void);
+void __exit a3xx_unregister(void);
+
+#endif /* __MSM_GPU_H__ */
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
new file mode 100644
index 0000000..8171537d
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "msm_ringbuffer.h"
+#include "msm_gpu.h"
+
+struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size)
+{
+	struct msm_ringbuffer *ring;
+	int ret;
+
+	size = ALIGN(size, 4);   /* size should be dword aligned */
+
+	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
+	if (!ring) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	ring->gpu = gpu;
+	ring->bo = msm_gem_new(gpu->dev, size, MSM_BO_WC);
+	if (IS_ERR(ring->bo)) {
+		ret = PTR_ERR(ring->bo);
+		ring->bo = NULL;
+		goto fail;
+	}
+
+	ring->start = msm_gem_vaddr_locked(ring->bo);
+	ring->end   = ring->start + (size / 4);
+	ring->cur   = ring->start;
+
+	ring->size = size;
+
+	return ring;
+
+fail:
+	if (ring)
+		msm_ringbuffer_destroy(ring);
+	return ERR_PTR(ret);
+}
+
+void msm_ringbuffer_destroy(struct msm_ringbuffer *ring)
+{
+	if (ring->bo)
+		drm_gem_object_unreference(ring->bo);
+	kfree(ring);
+}
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
new file mode 100644
index 0000000..6e0e104
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MSM_RINGBUFFER_H__
+#define __MSM_RINGBUFFER_H__
+
+#include "msm_drv.h"
+
+struct msm_ringbuffer {
+	struct msm_gpu *gpu;
+	int size;
+	struct drm_gem_object *bo;
+	uint32_t *start, *end, *cur;
+};
+
+struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size);
+void msm_ringbuffer_destroy(struct msm_ringbuffer *ring);
+
+/* ringbuffer helpers (the parts that are same for a3xx/a2xx/z180..) */
+
+static inline void
+OUT_RING(struct msm_ringbuffer *ring, uint32_t data)
+{
+	if (ring->cur == ring->end)
+		ring->cur = ring->start;
+	*(ring->cur++) = data;
+}
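+
+/* Illustrative usage sketch (not a complete example): ringbuffer writes
+ * only become visible to the GPU once the gpu's flush() hook advances the
+ * hardware write pointer, so a typical sequence looks like:
+ *
+ *   OUT_RING(ring, opcode);
+ *   OUT_RING(ring, payload);
+ *   gpu->funcs->flush(gpu);
+ */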
+
+#endif /* __MSM_RINGBUFFER_H__ */
diff --git a/include/uapi/drm/Kbuild b/include/uapi/drm/Kbuild
index 119487e..2d9a25d 100644
--- a/include/uapi/drm/Kbuild
+++ b/include/uapi/drm/Kbuild
@@ -16,3 +16,4 @@
 header-y += tegra_drm.h
 header-y += via_drm.h
 header-y += vmwgfx_drm.h
+header-y += msm_drm.h
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
new file mode 100644
index 0000000..d3c6207
--- /dev/null
+++ b/include/uapi/drm/msm_drm.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __MSM_DRM_H__
+#define __MSM_DRM_H__
+
+#include <stddef.h>
+#include <drm/drm.h>
+
+/* Please note that modifications to all structs defined here are
+ * subject to backwards-compatibility constraints:
+ *  1) Do not use pointers, use uint64_t instead for 32 bit / 64 bit
+ *     user/kernel compatibility
+ *  2) Keep fields aligned to their size
+ *  3) Because of how drm_ioctl() works, we can add new fields at
+ *     the end of an ioctl if some care is taken: drm_ioctl() will
+ *     zero out the new fields at the tail of the ioctl, so a zero
+ *     value should have a backwards compatible meaning.  And for
+ *     output params, userspace won't see the newly added output
+ *     fields.. so that has to be somehow ok.
+ */
+
+#define MSM_PIPE_NONE        0x00
+#define MSM_PIPE_2D0         0x01
+#define MSM_PIPE_2D1         0x02
+#define MSM_PIPE_3D0         0x10
+
+/* timeouts are specified in clock-monotonic absolute times (to simplify
+ * restarting interrupted ioctls).  The following struct is logically the
+ * same as 'struct timespec' but 32/64b ABI safe.
+ */
+struct drm_msm_timespec {
+	int64_t tv_sec;          /* seconds */
+	int64_t tv_nsec;         /* nanoseconds */
+};
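+
+/* For example (illustrative only, 'req' being any of the structs below that
+ * embed a timeout), a userspace caller wanting "16ms from now" would do:
+ *
+ *   struct timespec ts;
+ *   clock_gettime(CLOCK_MONOTONIC, &ts);
+ *   ts.tv_nsec += 16000000;
+ *   if (ts.tv_nsec >= 1000000000) {
+ *       ts.tv_nsec -= 1000000000;
+ *       ts.tv_sec++;
+ *   }
+ *   req.timeout.tv_sec  = ts.tv_sec;
+ *   req.timeout.tv_nsec = ts.tv_nsec;
+ */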
+
+#define MSM_PARAM_GPU_ID     0x01
+#define MSM_PARAM_GMEM_SIZE  0x02
+
+struct drm_msm_param {
+	uint32_t pipe;           /* in, MSM_PIPE_x */
+	uint32_t param;          /* in, MSM_PARAM_x */
+	uint64_t value;          /* out (get_param) or in (set_param) */
+};
+
+/*
+ * GEM buffers:
+ */
+
+#define MSM_BO_SCANOUT       0x00000001     /* scanout capable */
+#define MSM_BO_GPU_READONLY  0x00000002
+#define MSM_BO_CACHE_MASK    0x000f0000
+/* cache modes */
+#define MSM_BO_CACHED        0x00010000
+#define MSM_BO_WC            0x00020000
+#define MSM_BO_UNCACHED      0x00040000
+
+struct drm_msm_gem_new {
+	uint64_t size;           /* in */
+	uint32_t flags;          /* in, mask of MSM_BO_x */
+	uint32_t handle;         /* out */
+};
+
+struct drm_msm_gem_info {
+	uint32_t handle;         /* in */
+	uint32_t pad;
+	uint64_t offset;         /* out, offset to pass to mmap() */
+};
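+
+/* Rough allocation/mapping sketch (illustrative only; uses the libdrm
+ * drmCommandWriteRead() helper and omits error handling):
+ *
+ *   struct drm_msm_gem_new new_req = { .size = 0x1000, .flags = MSM_BO_WC };
+ *   drmCommandWriteRead(fd, DRM_MSM_GEM_NEW, &new_req, sizeof(new_req));
+ *
+ *   struct drm_msm_gem_info info = { .handle = new_req.handle };
+ *   drmCommandWriteRead(fd, DRM_MSM_GEM_INFO, &info, sizeof(info));
+ *
+ *   void *ptr = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE, MAP_SHARED,
+ *                    fd, info.offset);
+ */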
+
+#define MSM_PREP_READ        0x01
+#define MSM_PREP_WRITE       0x02
+#define MSM_PREP_NOSYNC      0x04
+
+struct drm_msm_gem_cpu_prep {
+	uint32_t handle;         /* in */
+	uint32_t op;             /* in, mask of MSM_PREP_x */
+	struct drm_msm_timespec timeout;   /* in */
+};
+
+struct drm_msm_gem_cpu_fini {
+	uint32_t handle;         /* in */
+};
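+
+/* Typical CPU access pattern (illustrative only; 'abs_timeout' is an
+ * absolute CLOCK_MONOTONIC drm_msm_timespec as described above):
+ *
+ *   struct drm_msm_gem_cpu_prep prep = {
+ *       .handle  = handle,
+ *       .op      = MSM_PREP_WRITE,
+ *       .timeout = abs_timeout,
+ *   };
+ *   drmCommandWrite(fd, DRM_MSM_GEM_CPU_PREP, &prep, sizeof(prep));
+ *
+ *   ... CPU writes to the mmap'd buffer ...
+ *
+ *   struct drm_msm_gem_cpu_fini fini = { .handle = handle };
+ *   drmCommandWrite(fd, DRM_MSM_GEM_CPU_FINI, &fini, sizeof(fini));
+ */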
+
+/*
+ * Cmdstream Submission:
+ */
+
+/* The value written into the cmdstream is logically:
+ *
+ *   ((relocbuf->gpuaddr + reloc_offset) << shift) | or
+ *
+ * When we have GPU's w/ >32bit ptrs, it should be possible to deal
+ * with this by emit'ing two reloc entries with appropriate shift
+ * values.  Or a new MSM_SUBMIT_CMD_x type would also be an option.
+ *
+ * NOTE that reloc's must be sorted by order of increasing submit_offset,
+ * otherwise EINVAL.
+ */
+struct drm_msm_gem_submit_reloc {
+	uint32_t submit_offset;  /* in, offset from submit_bo */
+	uint32_t or;             /* in, value OR'd with result */
+	int32_t  shift;          /* in, amount of left shift (can be negative) */
+	uint32_t reloc_idx;      /* in, index of reloc_bo buffer */
+	uint64_t reloc_offset;   /* in, offset from start of reloc_bo */
+};
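+
+/* Worked example (illustrative only): with reloc_offset=0x100, shift=-2,
+ * or=0, and the reloc_idx buffer currently mapped at gpu address
+ * 0x10001000, the kernel patches the dword at submit_offset to
+ * (0x10001000 + 0x100) >> 2 = 0x04000440.
+ */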
+
+/* submit-types:
+ *   BUF - this cmd buffer is executed normally.
+ *   IB_TARGET_BUF - this cmd buffer is an IB target.  Reloc's are
+ *      processed normally, but the kernel does not set up an IB to
+ *      this buffer in the first-level ringbuffer
+ *   CTX_RESTORE_BUF - only executed if there has been a GPU context
+ *      switch since the last SUBMIT ioctl
+ */
+#define MSM_SUBMIT_CMD_BUF             0x0001
+#define MSM_SUBMIT_CMD_IB_TARGET_BUF   0x0002
+#define MSM_SUBMIT_CMD_CTX_RESTORE_BUF 0x0003
+struct drm_msm_gem_submit_cmd {
+	uint32_t type;           /* in, one of MSM_SUBMIT_CMD_x */
+	uint32_t submit_idx;     /* in, index of submit_bo cmdstream buffer */
+	uint32_t submit_offset;  /* in, offset into submit_bo */
+	uint32_t size;           /* in, cmdstream size */
+	uint32_t pad;
+	uint32_t nr_relocs;      /* in, number of submit_reloc's */
+	uint64_t __user relocs;  /* in, ptr to array of submit_reloc's */
+};
+
+/* Each buffer referenced elsewhere in the cmdstream submit (ie. the
+ * cmdstream buffer(s) themselves or reloc entries) has one (and only
+ * one) entry in the submit->bos[] table.
+ *
+ * As an optimization, the current buffer (gpu virtual address) can be
+ * passed back through the 'presumed' field.  If on a subsequent reloc,
+ * userspace passes back a 'presumed' address that is still valid,
+ * then patching the cmdstream for this entry is skipped.  This can
+ * avoid the kernel needing to map/access the cmdstream bo in the
+ * common case.
+ */
+#define MSM_SUBMIT_BO_READ             0x0001
+#define MSM_SUBMIT_BO_WRITE            0x0002
+struct drm_msm_gem_submit_bo {
+	uint32_t flags;          /* in, mask of MSM_SUBMIT_BO_x */
+	uint32_t handle;         /* in, GEM handle */
+	uint64_t presumed;       /* in/out, presumed buffer address */
+};
+
+/* Each cmdstream submit consists of a table of buffers involved, and
+ * one or more cmdstream buffers.  This allows for conditional execution
+ * (context-restore), and IB buffers needed for per tile/bin draw cmds.
+ */
+struct drm_msm_gem_submit {
+	uint32_t pipe;           /* in, MSM_PIPE_x */
+	uint32_t fence;          /* out */
+	uint32_t nr_bos;         /* in, number of submit_bo's */
+	uint32_t nr_cmds;        /* in, number of submit_cmd's */
+	uint64_t __user bos;     /* in, ptr to array of submit_bo's */
+	uint64_t __user cmds;    /* in, ptr to array of submit_cmd's */
+};
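+
+/* Rough userspace-side sketch of a one-buffer submit (illustrative only;
+ * 'fd', 'cmds_handle', 'cmdstream_size', 'last_known_iova',
+ * 'relocs'/'nr_relocs' and the libdrm drmCommandWriteRead() helper are
+ * assumptions, not part of this header):
+ *
+ *   struct drm_msm_gem_submit_bo bo = {
+ *       .flags    = MSM_SUBMIT_BO_READ,
+ *       .handle   = cmds_handle,
+ *       .presumed = last_known_iova,
+ *   };
+ *   struct drm_msm_gem_submit_cmd cmd = {
+ *       .type          = MSM_SUBMIT_CMD_BUF,
+ *       .submit_idx    = 0,
+ *       .submit_offset = 0,
+ *       .size          = cmdstream_size,
+ *       .nr_relocs     = nr_relocs,
+ *       .relocs        = (uint64_t)(uintptr_t)relocs,
+ *   };
+ *   struct drm_msm_gem_submit req = {
+ *       .pipe    = MSM_PIPE_3D0,
+ *       .nr_bos  = 1,
+ *       .nr_cmds = 1,
+ *       .bos     = (uint64_t)(uintptr_t)&bo,
+ *       .cmds    = (uint64_t)(uintptr_t)&cmd,
+ *   };
+ *   drmCommandWriteRead(fd, DRM_MSM_GEM_SUBMIT, &req, sizeof(req));
+ *
+ * On success, req.fence holds the fence number to pass to the WAIT_FENCE
+ * ioctl (or to wait on implicitly via CPU_PREP on one of the bo's).
+ */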
+
+/* The normal way to synchronize with the GPU is just to CPU_PREP on
+ * a buffer if you need to access it from the CPU (other cmdstream
+ * submission from same or other contexts, PAGE_FLIP ioctl, etc, all
+ * handle the required synchronization under the hood).  This ioctl
+ * mainly just exists as a way to implement the gallium pipe_fence
+ * APIs without requiring a dummy bo to synchronize on.
+ */
+struct drm_msm_wait_fence {
+	uint32_t fence;          /* in */
+	uint32_t pad;
+	struct drm_msm_timespec timeout;   /* in */
+};
+
+#define DRM_MSM_GET_PARAM              0x00
+/* placeholder:
+#define DRM_MSM_SET_PARAM              0x01
+ */
+#define DRM_MSM_GEM_NEW                0x02
+#define DRM_MSM_GEM_INFO               0x03
+#define DRM_MSM_GEM_CPU_PREP           0x04
+#define DRM_MSM_GEM_CPU_FINI           0x05
+#define DRM_MSM_GEM_SUBMIT             0x06
+#define DRM_MSM_WAIT_FENCE             0x07
+#define DRM_MSM_NUM_IOCTLS             0x08
+
+#define DRM_IOCTL_MSM_GET_PARAM        DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
+#define DRM_IOCTL_MSM_GEM_NEW          DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
+#define DRM_IOCTL_MSM_GEM_INFO         DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_INFO, struct drm_msm_gem_info)
+#define DRM_IOCTL_MSM_GEM_CPU_PREP     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_CPU_PREP, struct drm_msm_gem_cpu_prep)
+#define DRM_IOCTL_MSM_GEM_CPU_FINI     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_CPU_FINI, struct drm_msm_gem_cpu_fini)
+#define DRM_IOCTL_MSM_GEM_SUBMIT       DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_SUBMIT, struct drm_msm_gem_submit)
+#define DRM_IOCTL_MSM_WAIT_FENCE       DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_WAIT_FENCE, struct drm_msm_wait_fence)
+
+#endif /* __MSM_DRM_H__ */