Merge "arm64: defconfig: Enable G-Link" into msm-4.8
diff --git a/arch/arm64/include/asm/dma-iommu.h b/arch/arm64/include/asm/dma-iommu.h
index df03ebc..ab0e5b2 100644
--- a/arch/arm64/include/asm/dma-iommu.h
+++ b/arch/arm64/include/asm/dma-iommu.h
@@ -9,6 +9,7 @@
 #include <linux/dma-debug.h>
 #include <linux/kmemcheck.h>
 #include <linux/kref.h>
+#include <linux/dma-mapping-fast.h>
 
 struct dma_iommu_mapping {
 	/* iommu specific data */
@@ -20,6 +21,8 @@
 
 	spinlock_t		lock;
 	struct kref		kref;
+
+	struct dma_fast_smmu_mapping *fast;
 };
 
 #ifdef CONFIG_ARM64_DMA_USE_IOMMU
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index c065f54..6160c66 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -34,6 +34,7 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/dma-iommu.h>
+#include <linux/dma-mapping-fast.h>
 
 #include "mm.h"
 
@@ -2074,7 +2075,11 @@
 			    struct dma_iommu_mapping *mapping)
 {
 	int err;
-	int s1_bypass = 0;
+	int s1_bypass = 0, is_fast = 0;
+
+	iommu_domain_get_attr(mapping->domain, DOMAIN_ATTR_FAST, &is_fast);
+	if (is_fast)
+		return fast_smmu_attach_device(dev, mapping);
 
 	err = iommu_attach_device(mapping->domain, dev);
 	if (err)
@@ -2103,6 +2108,7 @@
 void arm_iommu_detach_device(struct device *dev)
 {
 	struct dma_iommu_mapping *mapping;
+	int is_fast = 0;
 
 	mapping = to_dma_iommu_mapping(dev);
 	if (!mapping) {
@@ -2110,6 +2116,12 @@
 		return;
 	}
 
+	iommu_domain_get_attr(mapping->domain, DOMAIN_ATTR_FAST, &is_fast);
+	if (is_fast) {
+		fast_smmu_detach_device(dev, mapping);
+		return;
+	}
+
 	iommu_detach_device(mapping->domain, dev);
 	kref_put(&mapping->kref, release_iommu_mapping);
 	dev->archdata.mapping = NULL;
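
A minimal driver-side sketch (not part of this patch) of how a client opts a
device into the fast path.  It assumes this tree's arm_iommu_create_mapping()/
arm_iommu_attach_device()/arm_iommu_release_mapping() helpers and the existing
DOMAIN_ATTR_FAST attribute; the function name and IOVA range below are
illustrative only:

    #include <linux/device.h>
    #include <linux/err.h>
    #include <linux/iommu.h>
    #include <linux/sizes.h>
    #include <asm/dma-iommu.h>

    static int example_attach_fast(struct device *dev)
    {
    	int fast = 1;
    	struct dma_iommu_mapping *mapping;

    	/* The fast allocator only supports IOVAs below 4GB. */
    	mapping = arm_iommu_create_mapping(dev->bus, 0, SZ_1G * 4ULL);
    	if (IS_ERR(mapping))
    		return PTR_ERR(mapping);

    	if (iommu_domain_set_attr(mapping->domain, DOMAIN_ATTR_FAST, &fast)) {
    		arm_iommu_release_mapping(mapping);
    		return -EINVAL;
    	}

    	/*
    	 * arm_iommu_attach_device() now checks DOMAIN_ATTR_FAST and hands
    	 * the device to fast_smmu_attach_device() instead of the regular
    	 * attach path.
    	 */
    	return arm_iommu_attach_device(dev, mapping);
    }
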
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index fae9033..37cb37f 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -6,7 +6,7 @@
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
-obj-$(CONFIG_IOMMU_IO_PGTABLE_FAST) += io-pgtable-fast.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_FAST) += io-pgtable-fast.o dma-mapping-fast.o
 obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
 obj-$(CONFIG_IOMMU_DEBUG) += iommu-debug.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index e7caeb0..e95744f 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -2572,6 +2572,17 @@
 		*((int *)data) = smmu_domain->secure_vmid;
 		ret = 0;
 		break;
+	case DOMAIN_ATTR_PGTBL_INFO: {
+		struct iommu_pgtbl_info *info = data;
+
+		if (!(smmu_domain->attributes & (1 << DOMAIN_ATTR_FAST))) {
+			ret = -ENODEV;
+			break;
+		}
+		info->pmds = smmu_domain->pgtbl_cfg.av8l_fast_cfg.pmds;
+		ret = 0;
+		break;
+	}
 	default:
 		return -ENODEV;
 	}
@@ -2794,6 +2805,20 @@
 	arm_smmu_tlb_inv_context(to_smmu_domain(domain));
 }
 
+static int arm_smmu_enable_config_clocks(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	return arm_smmu_power_on(smmu_domain->smmu);
+}
+
+static void arm_smmu_disable_config_clocks(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+
+	arm_smmu_power_off(smmu_domain->smmu);
+}
+
 static struct iommu_ops arm_smmu_ops = {
 	.capable		= arm_smmu_capable,
 	.domain_alloc		= arm_smmu_domain_alloc,
@@ -2815,6 +2840,8 @@
 	.reg_read		= arm_smmu_reg_read,
 	.reg_write		= arm_smmu_reg_write,
 	.tlbi_domain		= arm_smmu_tlbi_domain,
+	.enable_config_clocks	= arm_smmu_enable_config_clocks,
+	.disable_config_clocks	= arm_smmu_disable_config_clocks,
 };
 
 static int arm_smmu_wait_for_halt(struct arm_smmu_device *smmu)
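
The new DOMAIN_ATTR_PGTBL_INFO attribute exposes the base of the av8l-fast
page table so the DMA layer can write PTEs directly; it is only valid once
DOMAIN_ATTR_FAST has been set on the domain.  A hedged sketch of the consumer
side (fast_smmu_attach_device() below does essentially this):

    #include <linux/iommu.h>
    #include <linux/io-pgtable-fast.h>

    static av8l_fast_iopte *example_get_pmds(struct iommu_domain *domain)
    {
    	struct iommu_pgtbl_info info;

    	/* arm-smmu returns -ENODEV here unless DOMAIN_ATTR_FAST is set. */
    	if (iommu_domain_get_attr(domain, DOMAIN_ATTR_PGTBL_INFO, &info))
    		return NULL;

    	return info.pmds;
    }
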
diff --git a/drivers/iommu/dma-mapping-fast.c b/drivers/iommu/dma-mapping-fast.c
new file mode 100644
index 0000000..2d36ee3
--- /dev/null
+++ b/drivers/iommu/dma-mapping-fast.c
@@ -0,0 +1,645 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/dma-contiguous.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-mapping-fast.h>
+#include <linux/io-pgtable-fast.h>
+#include <asm/cacheflush.h>
+#include <asm/dma-iommu.h>
+
+
+/* some redundant definitions... :( TODO: move to io-pgtable-fast.h */
+#define FAST_PAGE_SHIFT		12
+#define FAST_PAGE_SIZE (1UL << FAST_PAGE_SHIFT)
+#define FAST_PAGE_MASK (~(FAST_PAGE_SIZE - 1))
+#define FAST_PTE_ADDR_MASK		((av8l_fast_iopte)0xfffffffff000)
+
+/*
+ * Checks if the allocated range (ending at @end) covered the upcoming
+ * stale bit.  We don't need to know exactly where the range starts since
+ * we already know where the candidate search range started.  If, starting
+ * from the beginning of the candidate search range, we had to step over
+ * (or landed directly on top of) the upcoming stale bit, then we return
+ * true.
+ *
+ * Due to wrapping, there are two scenarios we'll need to check: (1) if the
+ * range [search_start, upcoming_stale] spans 0 (i.e. search_start >
+ * upcoming_stale), and (2) if the range [search_start, upcoming_stale]
+ * does *not* span 0 (i.e. search_start <= upcoming_stale).  For each of
+ * those two scenarios we then need to check whether the allocation
+ * wrapped around the end of the bitmap and whether it reached (or
+ * stepped over) the upcoming stale bit.
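+ *
+ * Worked example (illustrative numbers): with search_start = 6 and
+ * upcoming_stale = 2, an allocation ending at bit 1 has wrapped past
+ * bit 0 but not yet reached the stale bit, so it is not covered; one
+ * ending at bit 3 has stepped over the stale bit and is covered, which
+ * forces a full TLB invalidate in __fast_smmu_alloc_iova().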
+ */
+static bool __bit_covered_stale(unsigned long upcoming_stale,
+				unsigned long search_start,
+				unsigned long end)
+{
+	if (search_start > upcoming_stale) {
+		if (end >= search_start) {
+			/*
+			 * We started searching above upcoming_stale and we
+			 * didn't wrap, so we couldn't have crossed
+			 * upcoming_stale.
+			 */
+			return false;
+		}
+		/*
+		 * We wrapped. Did we cross (or land on top of)
+		 * upcoming_stale?
+		 */
+		return end >= upcoming_stale;
+	}
+
+	if (search_start <= upcoming_stale) {
+		if (end >= search_start) {
+			/*
+			 * We didn't wrap.  Did we cross (or land on top
+			 * of) upcoming_stale?
+			 */
+			return end >= upcoming_stale;
+		}
+		/*
+		 * We wrapped. So we must have crossed upcoming_stale
+		 * (since we started searching below it).
+		 */
+		return true;
+	}
+
+	/* we should have covered all logical combinations... */
+	WARN_ON(1);
+	return true;
+}
+
+static dma_addr_t __fast_smmu_alloc_iova(struct dma_fast_smmu_mapping *mapping,
+					 size_t size)
+{
+	unsigned long bit, prev_search_start, nbits = size >> FAST_PAGE_SHIFT;
+	unsigned long align = (1 << get_order(size)) - 1;
+
+	bit = bitmap_find_next_zero_area(
+		mapping->bitmap, mapping->num_4k_pages, mapping->next_start,
+		nbits, align);
+	if (unlikely(bit > mapping->num_4k_pages)) {
+		/* try wrapping */
+		mapping->next_start = 0; /* TODO: SHOULD I REALLY DO THIS?!? */
+		bit = bitmap_find_next_zero_area(
+			mapping->bitmap, mapping->num_4k_pages, 0, nbits,
+			align);
+		if (unlikely(bit > mapping->num_4k_pages))
+			return DMA_ERROR_CODE;
+	}
+
+	bitmap_set(mapping->bitmap, bit, nbits);
+	prev_search_start = mapping->next_start;
+	mapping->next_start = bit + nbits;
+	if (unlikely(mapping->next_start >= mapping->num_4k_pages))
+		mapping->next_start = 0;
+
+	/*
+	 * If we just re-allocated a VA whose TLB hasn't been invalidated
+	 * since it was last used and unmapped, we need to invalidate it
+	 * here.  We actually invalidate the entire TLB so that we don't
+	 * have to invalidate the TLB again until we wrap back around.
+	 */
+	if (mapping->have_stale_tlbs &&
+	    __bit_covered_stale(mapping->upcoming_stale_bit,
+				prev_search_start,
+				bit + nbits - 1)) {
+		iommu_tlbiall(mapping->domain);
+		mapping->have_stale_tlbs = false;
+	}
+
+	return (bit << FAST_PAGE_SHIFT) + mapping->base;
+}
+
+/*
+ * Checks whether the candidate bit will be allocated sooner than the
+ * current upcoming stale bit.  We can say candidate will be upcoming
+ * sooner than the current upcoming stale bit if it lies between the
+ * starting bit of the next search range and the upcoming stale bit
+ * (allowing for wrap-around).
+ *
+ * Stated differently, we're checking the relative ordering of three
+ * unsigned numbers.  So we need to check all 6 (i.e. 3!) permutations,
+ * namely:
+ *
+ *     0 |---A---B---C---| TOP (Case 1)
+ *     0 |---A---C---B---| TOP (Case 2)
+ *     0 |---B---A---C---| TOP (Case 3)
+ *     0 |---B---C---A---| TOP (Case 4)
+ *     0 |---C---A---B---| TOP (Case 5)
+ *     0 |---C---B---A---| TOP (Case 6)
+ *
+ * Note that since we're allowing numbers to wrap, the following three
+ * scenarios are all equivalent for Case 1:
+ *
+ *     0 |---A---B---C---| TOP
+ *     0 |---C---A---B---| TOP (C has wrapped. This is Case 5.)
+ *     0 |---B---C---A---| TOP (C and B have wrapped. This is Case 4.)
+ *
+ * In any of these cases, if we start searching from A, we will find B
+ * before we find C.
+ *
+ * We can also find two equivalent cases for Case 2:
+ *
+ *     0 |---A---C---B---| TOP
+ *     0 |---B---A---C---| TOP (B has wrapped. This is Case 3.)
+ *     0 |---C---B---A---| TOP (B and C have wrapped. This is Case 6.)
+ *
+ * In any of these cases, if we start searching from A, we will find C
+ * before we find B.
+ */
+static bool __bit_is_sooner(unsigned long candidate,
+			    struct dma_fast_smmu_mapping *mapping)
+{
+	unsigned long A = mapping->next_start;
+	unsigned long B = candidate;
+	unsigned long C = mapping->upcoming_stale_bit;
+
+	if ((A < B && B < C) ||	/* Case 1 */
+	    (C < A && A < B) ||	/* Case 5 */
+	    (B < C && C < A))	/* Case 4 */
+		return true;
+
+	if ((A < C && C < B) ||	/* Case 2 */
+	    (B < A && A < C) ||	/* Case 3 */
+	    (C < B && B < A))	/* Case 6 */
+		return false;
+
+	/*
+	 * For simplicity, we've been ignoring the possibility of any of
+	 * our three numbers being equal.  Handle those cases here (they
+	 * shouldn't happen very often, I think).
+	 */
+
+	/*
+	 * If candidate is the next bit to be searched then it's definitely
+	 * sooner.
+	 */
+	if (A == B)
+		return true;
+
+	/*
+	 * If candidate is the next upcoming stale bit we'll return false
+	 * to avoid doing `upcoming = candidate' in the caller (which would
+	 * be useless since they're already equal)
+	 */
+	if (B == C)
+		return false;
+
+	/*
+	 * If next start is the upcoming stale bit then candidate can't
+	 * possibly be sooner.  The "soonest" bit is already selected.
+	 */
+	if (A == C)
+		return false;
+
+	/* We should have covered all logical combinations. */
+	WARN(1, "Well, that's awkward. A=%ld, B=%ld, C=%ld\n", A, B, C);
+	return true;
+}
+
+static void __fast_smmu_free_iova(struct dma_fast_smmu_mapping *mapping,
+				  dma_addr_t iova, size_t size)
+{
+	unsigned long start_bit = (iova - mapping->base) >> FAST_PAGE_SHIFT;
+	unsigned long nbits = size >> FAST_PAGE_SHIFT;
+
+	/*
+	 * We don't invalidate TLBs on unmap.  We invalidate TLBs on map
+	 * when we're about to re-allocate a VA that was previously
+	 * unmapped but hasn't yet been invalidated.  So we need to keep
+	 * track of which bit is the closest to being re-allocated here.
+	 */
+	if (__bit_is_sooner(start_bit, mapping))
+		mapping->upcoming_stale_bit = start_bit;
+
+	bitmap_clear(mapping->bitmap, start_bit, nbits);
+	mapping->have_stale_tlbs = true;
+}
+
+
+static void __fast_dma_page_cpu_to_dev(struct page *page, unsigned long off,
+				       size_t size, enum dma_data_direction dir)
+{
+	__dma_map_area(page_address(page) + off, size, dir);
+}
+
+static void __fast_dma_page_dev_to_cpu(struct page *page, unsigned long off,
+				       size_t size, enum dma_data_direction dir)
+{
+	__dma_unmap_area(page_address(page) + off, size, dir);
+
+	/* TODO: WHAT IS THIS? */
+	/*
+	 * Mark the D-cache clean for this page to avoid extra flushing.
+	 */
+	if (dir != DMA_TO_DEVICE && off == 0 && size >= PAGE_SIZE)
+		set_bit(PG_dcache_clean, &page->flags);
+}
+
+static int __fast_dma_direction_to_prot(enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return IOMMU_READ | IOMMU_WRITE;
+	case DMA_TO_DEVICE:
+		return IOMMU_READ;
+	case DMA_FROM_DEVICE:
+		return IOMMU_WRITE;
+	default:
+		return 0;
+	}
+}
+
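+/*
+ * Map a single page (or a sub-page range of it) into the fast IOVA space:
+ * do the CPU cache maintenance, allocate an IOVA, install the PTEs with
+ * av8l_fast_map_public(), then clean the PTEs themselves so the (assumed
+ * non-coherent) SMMU page table walker sees them.
+ */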
+static dma_addr_t fast_smmu_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev->archdata.mapping->fast;
+	dma_addr_t iova;
+	unsigned long flags;
+	av8l_fast_iopte *pmd;
+	phys_addr_t phys_plus_off = page_to_phys(page) + offset;
+	phys_addr_t phys_to_map = round_down(phys_plus_off, FAST_PAGE_SIZE);
+	unsigned long offset_from_phys_to_map = phys_plus_off & ~FAST_PAGE_MASK;
+	size_t len = ALIGN(size + offset_from_phys_to_map, FAST_PAGE_SIZE);
+	int nptes = len >> FAST_PAGE_SHIFT;
+	bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
+	int prot = __fast_dma_direction_to_prot(dir);
+
+	if (attrs & DMA_ATTR_STRONGLY_ORDERED)
+		prot |= IOMMU_MMIO;
+
+	if (!skip_sync)
+		__fast_dma_page_cpu_to_dev(phys_to_page(phys_to_map),
+					   offset_from_phys_to_map, size, dir);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+
+	iova = __fast_smmu_alloc_iova(mapping, len);
+
+	if (unlikely(iova == DMA_ERROR_CODE))
+		goto fail;
+
+	pmd = iopte_pmd_offset(mapping->pgtbl_pmds, iova);
+
+	if (unlikely(av8l_fast_map_public(pmd, phys_to_map, len, prot)))
+		goto fail_free_iova;
+
+	if (!skip_sync)		/* TODO: should ask SMMU if coherent */
+		dmac_clean_range(pmd, pmd + nptes);
+
+	spin_unlock_irqrestore(&mapping->lock, flags);
+	return iova + offset_from_phys_to_map;
+
+fail_free_iova:
+	__fast_smmu_free_iova(mapping, iova, len);
+fail:
+	spin_unlock_irqrestore(&mapping->lock, flags);
+	return DMA_ERROR_CODE;
+}
+
+static void fast_smmu_unmap_page(struct device *dev, dma_addr_t iova,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev->archdata.mapping->fast;
+	unsigned long flags;
+	av8l_fast_iopte *pmd = iopte_pmd_offset(mapping->pgtbl_pmds, iova);
+	unsigned long offset = iova & ~FAST_PAGE_MASK;
+	size_t len = ALIGN(size + offset, FAST_PAGE_SIZE);
+	int nptes = len >> FAST_PAGE_SHIFT;
+	struct page *page = phys_to_page((*pmd & FAST_PTE_ADDR_MASK));
+	bool skip_sync = (attrs & DMA_ATTR_SKIP_CPU_SYNC);
+
+	if (!skip_sync)
+		__fast_dma_page_dev_to_cpu(page, offset, size, dir);
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	av8l_fast_unmap_public(pmd, len);
+	if (!skip_sync)		/* TODO: should ask SMMU if coherent */
+		dmac_clean_range(pmd, pmd + nptes);
+	__fast_smmu_free_iova(mapping, iova, len);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+}
+
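+/* Scatter-gather mapping is not implemented for the fast path yet. */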
+static int fast_smmu_map_sg(struct device *dev, struct scatterlist *sg,
+			    int nents, enum dma_data_direction dir,
+			    unsigned long attrs)
+{
+	return -EINVAL;
+}
+
+static void fast_smmu_unmap_sg(struct device *dev,
+			       struct scatterlist *sg, int nents,
+			       enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	WARN_ON_ONCE(1);
+}
+
+static void __fast_smmu_free_pages(struct page **pages, int count)
+{
+	int i;
+
+	for (i = 0; i < count; i++)
+		__free_page(pages[i]);
+	kvfree(pages);
+}
+
+static struct page **__fast_smmu_alloc_pages(unsigned int count, gfp_t gfp)
+{
+	struct page **pages;
+	unsigned int i = 0, array_size = count * sizeof(*pages);
+
+	if (array_size <= PAGE_SIZE)
+		pages = kzalloc(array_size, GFP_KERNEL);
+	else
+		pages = vzalloc(array_size);
+	if (!pages)
+		return NULL;
+
+	/* IOMMU can map any pages, so highmem can also be used here */
+	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
+	for (i = 0; i < count; ++i) {
+		struct page *page = alloc_page(gfp);
+
+		if (!page) {
+			__fast_smmu_free_pages(pages, i);
+			return NULL;
+		}
+		pages[i] = page;
+	}
+	return pages;
+}
+
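+/*
+ * Allocate @size bytes as individual pages, map them contiguously into the
+ * fast IOVA space, and remap them into the vmalloc area with write-combine
+ * attributes for CPU access.
+ */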
+static void *fast_smmu_alloc(struct device *dev, size_t size,
+			     dma_addr_t *handle, gfp_t gfp,
+			     unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev->archdata.mapping->fast;
+	struct sg_table sgt;
+	dma_addr_t dma_addr, iova_iter;
+	void *addr;
+	av8l_fast_iopte *ptep;
+	unsigned long flags;
+	struct sg_mapping_iter miter;
+	unsigned int count = ALIGN(size, SZ_4K) >> FAST_PAGE_SHIFT;
+	int prot = IOMMU_READ | IOMMU_WRITE; /* TODO: extract from attrs */
+	pgprot_t remap_prot = pgprot_writecombine(PAGE_KERNEL);
+	struct page **pages;
+
+	*handle = DMA_ERROR_CODE;
+
+	pages = __fast_smmu_alloc_pages(count, gfp);
+	if (!pages) {
+		dev_err(dev, "no pages\n");
+		return NULL;
+	}
+
+	size = ALIGN(size, SZ_4K);
+	if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, gfp)) {
+		dev_err(dev, "no sg tablen\n");
+		goto out_free_pages;
+	}
+
+	if (!(prot & IOMMU_CACHE)) {
+		/*
+		 * The CPU-centric flushing implied by SG_MITER_TO_SG isn't
+		 * sufficient here, so skip it by using the "wrong" direction.
+		 */
+		sg_miter_start(&miter, sgt.sgl, sgt.orig_nents,
+			       SG_MITER_FROM_SG);
+		while (sg_miter_next(&miter))
+			__dma_flush_range(miter.addr,
+					  miter.addr + miter.length);
+		sg_miter_stop(&miter);
+	}
+
+	spin_lock_irqsave(&mapping->lock, flags);
+	dma_addr = __fast_smmu_alloc_iova(mapping, size);
+	if (dma_addr == DMA_ERROR_CODE) {
+		dev_err(dev, "no iova\n");
+		spin_unlock_irqrestore(&mapping->lock, flags);
+		goto out_free_sg;
+	}
+	iova_iter = dma_addr;
+	sg_miter_start(&miter, sgt.sgl, sgt.orig_nents,
+		       SG_MITER_FROM_SG | SG_MITER_ATOMIC);
+	while (sg_miter_next(&miter)) {
+		int nptes = miter.length >> FAST_PAGE_SHIFT;
+
+		ptep = iopte_pmd_offset(mapping->pgtbl_pmds, iova_iter);
+		if (unlikely(av8l_fast_map_public(
+				     ptep, page_to_phys(miter.page),
+				     miter.length, prot))) {
+			dev_err(dev, "no map public\n");
+			/* TODO: unwind previously successful mappings */
+			goto out_free_iova;
+		}
+		dmac_clean_range(ptep, ptep + nptes);
+		iova_iter += miter.length;
+	}
+	sg_miter_stop(&miter);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+
+	addr = dma_common_pages_remap(pages, size, VM_USERMAP, remap_prot,
+				      __builtin_return_address(0));
+	if (!addr) {
+		dev_err(dev, "no common pages\n");
+		goto out_unmap;
+	}
+
+	*handle = dma_addr;
+	sg_free_table(&sgt);
+	return addr;
+
+out_unmap:
+	/* need to take the lock again for page tables and iova */
+	spin_lock_irqsave(&mapping->lock, flags);
+	ptep = iopte_pmd_offset(mapping->pgtbl_pmds, dma_addr);
+	av8l_fast_unmap_public(ptep, size);
+	dmac_clean_range(ptep, ptep + count);
+out_free_iova:
+	__fast_smmu_free_iova(mapping, dma_addr, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+out_free_sg:
+	sg_free_table(&sgt);
+out_free_pages:
+	__fast_smmu_free_pages(pages, count);
+	return NULL;
+}
+
+static void fast_smmu_free(struct device *dev, size_t size,
+			   void *vaddr, dma_addr_t dma_handle,
+			   unsigned long attrs)
+{
+	struct dma_fast_smmu_mapping *mapping = dev->archdata.mapping->fast;
+	struct vm_struct *area;
+	struct page **pages;
+	size_t count = ALIGN(size, SZ_4K) >> FAST_PAGE_SHIFT;
+	av8l_fast_iopte *ptep;
+	unsigned long flags;
+
+	size = ALIGN(size, SZ_4K);
+
+	area = find_vm_area(vaddr);
+	if (WARN_ON_ONCE(!area))
+		return;
+
+	pages = area->pages;
+	dma_common_free_remap(vaddr, size, VM_USERMAP, false);
+	ptep = iopte_pmd_offset(mapping->pgtbl_pmds, dma_handle);
+	spin_lock_irqsave(&mapping->lock, flags);
+	av8l_fast_unmap_public(ptep, size);
+	dmac_clean_range(ptep, ptep + count);
+	__fast_smmu_free_iova(mapping, dma_handle, size);
+	spin_unlock_irqrestore(&mapping->lock, flags);
+	__fast_smmu_free_pages(pages, count);
+}
+
+static int fast_smmu_dma_supported(struct device *dev, u64 mask)
+{
+	return mask <= 0xffffffff;
+}
+
+static int fast_smmu_mapping_error(struct device *dev,
+				   dma_addr_t dma_addr)
+{
+	return dma_addr == DMA_ERROR_CODE;
+}
+
+static const struct dma_map_ops fast_smmu_dma_ops = {
+	.alloc = fast_smmu_alloc,
+	.free = fast_smmu_free,
+	.map_page = fast_smmu_map_page,
+	.unmap_page = fast_smmu_unmap_page,
+	.map_sg = fast_smmu_map_sg,
+	.unmap_sg = fast_smmu_unmap_sg,
+	.dma_supported = fast_smmu_dma_supported,
+	.mapping_error = fast_smmu_mapping_error,
+};
+
+/**
+ * __fast_smmu_create_mapping_sized
+ * @base: bottom of the VA range
+ * @size: size of the VA range in bytes
+ *
+ * Creates a mapping structure which holds information about used/unused IO
+ * address ranges, which is required to perform mapping with IOMMU aware
+ * functions.  The only VA range supported is [0, 4GB).
+ *
+ * The client device needs to be attached to the mapping with the
+ * fast_smmu_attach_device() function.
+ */
+static struct dma_fast_smmu_mapping *__fast_smmu_create_mapping_sized(
+	dma_addr_t base, size_t size)
+{
+	struct dma_fast_smmu_mapping *fast;
+
+	fast = kzalloc(sizeof(struct dma_fast_smmu_mapping), GFP_KERNEL);
+	if (!fast)
+		goto err;
+
+	fast->base = base;
+	fast->size = size;
+	fast->num_4k_pages = size >> FAST_PAGE_SHIFT;
+	fast->bitmap_size = BITS_TO_LONGS(fast->num_4k_pages) * sizeof(long);
+
+	fast->bitmap = kzalloc(fast->bitmap_size, GFP_KERNEL);
+	if (!fast->bitmap)
+		goto err2;
+
+	spin_lock_init(&fast->lock);
+
+	return fast;
+err2:
+	kfree(fast);
+err:
+	return ERR_PTR(-ENOMEM);
+}
+
+
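+/*
+ * Presumably sized for the av8l-fast page tables backing the full 4GB IOVA
+ * space with 4K pages: one first-level page, four second-level pages and
+ * 2048 last-level pages (4GB / 4KB = 1M PTEs * 8 bytes = 8MB).
+ */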
+#define PGTBL_MEM_SIZE (SZ_4K + (4 * SZ_4K) + (2048 * SZ_4K))
+
+
+/**
+ * fast_smmu_attach_device
+ * @dev: valid struct device pointer
+ * @mapping: io address space mapping structure (returned from
+ *	fast_smmu_create_mapping)
+ *
+ * Attaches specified io address space mapping to the provided device,
+ * this replaces the dma operations (dma_map_ops pointer) with the
+ * IOMMU aware version. More than one client might be attached to
+ * the same io address space mapping.
+ */
+int fast_smmu_attach_device(struct device *dev,
+			    struct dma_iommu_mapping *mapping)
+{
+	int atomic_domain = 1;
+	struct iommu_domain *domain = mapping->domain;
+	struct iommu_pgtbl_info info;
+	size_t size = mapping->bits << PAGE_SHIFT;
+
+	if (mapping->base + size > (SZ_1G * 4ULL))
+		return -EINVAL;
+
+	if (iommu_domain_set_attr(domain, DOMAIN_ATTR_ATOMIC,
+				  &atomic_domain))
+		return -EINVAL;
+
+	mapping->fast = __fast_smmu_create_mapping_sized(mapping->base, size);
+	if (IS_ERR(mapping->fast))
+		return -ENOMEM;
+	mapping->fast->domain = domain;
+	mapping->fast->dev = dev;
+
+	if (iommu_attach_device(domain, dev)) {
+		kfree(mapping->fast->bitmap);
+		kfree(mapping->fast);
+		return -EINVAL;
+	}
+
+	if (iommu_domain_get_attr(domain, DOMAIN_ATTR_PGTBL_INFO,
+				  &info)) {
+		dev_err(dev, "Couldn't get page table info\n");
+		fast_smmu_detach_device(dev, mapping);
+		return -EINVAL;
+	}
+	mapping->fast->pgtbl_pmds = info.pmds;
+
+	dev->archdata.mapping = mapping;
+	set_dma_ops(dev, &fast_smmu_dma_ops);
+
+	return 0;
+}
+EXPORT_SYMBOL(fast_smmu_attach_device);
+
+/**
+ * fast_smmu_detach_device
+ * @dev: valid struct device pointer
+ *
+ * Detaches the provided device from a previously attached map.
+ * This voids the dma operations (dma_map_ops pointer)
+ */
+void fast_smmu_detach_device(struct device *dev,
+			     struct dma_iommu_mapping *mapping)
+{
+	iommu_detach_device(mapping->domain, dev);
+	dev->archdata.mapping = NULL;
+	set_dma_ops(dev, NULL);
+
+	kfree(mapping->fast->bitmap);
+	kfree(mapping->fast);
+}
+EXPORT_SYMBOL(fast_smmu_detach_device);
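
For orientation, a hedged end-to-end sketch (not part of this patch) of what
a client sees once it has been attached through fast_smmu_attach_device();
the buffer size and direction are illustrative:

    #include <linux/dma-mapping.h>

    static int example_fast_dma_roundtrip(struct device *dev, struct page *page)
    {
    	dma_addr_t iova;

    	/* Dispatches to fast_smmu_map_page() via fast_smmu_dma_ops. */
    	iova = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
    	if (dma_mapping_error(dev, iova))
    		return -ENOMEM;

    	/* ... hand @iova to the hardware ... */

    	/*
    	 * Unmap clears the PTEs and releases the IOVA but deliberately
    	 * skips TLB invalidation; that is deferred until the IOVA is about
    	 * to be reused (see __fast_smmu_alloc_iova()).
    	 */
    	dma_unmap_page(dev, iova, PAGE_SIZE, DMA_TO_DEVICE);
    	return 0;
    }
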
diff --git a/include/linux/dma-mapping-fast.h b/include/linux/dma-mapping-fast.h
new file mode 100644
index 0000000..ad82efc
--- /dev/null
+++ b/include/linux/dma-mapping-fast.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_DMA_MAPPING_FAST_H
+#define __LINUX_DMA_MAPPING_FAST_H
+
+#include <linux/iommu.h>
+#include <linux/io-pgtable-fast.h>
+
+struct dma_fast_smmu_mapping {
+	struct device		*dev;
+	struct iommu_domain	*domain;
+	dma_addr_t	 base;
+	size_t		 size;
+	size_t		 num_4k_pages;
+
+	unsigned int	bitmap_size;
+	unsigned long	*bitmap;
+	unsigned long	next_start;
+	unsigned long	upcoming_stale_bit;
+	bool		have_stale_tlbs;
+
+	dma_addr_t	pgtbl_dma_handle;
+	av8l_fast_iopte	*pgtbl_pmds;
+
+	spinlock_t	lock;
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
+int fast_smmu_attach_device(struct device *dev,
+			    struct dma_iommu_mapping *mapping);
+void fast_smmu_detach_device(struct device *dev,
+			     struct dma_iommu_mapping *mapping);
+#else
+static inline int fast_smmu_attach_device(struct device *dev,
+					  struct dma_iommu_mapping *mapping)
+{
+	return -ENODEV;
+}
+
+static inline void fast_smmu_detach_device(struct device *dev,
+					   struct dma_iommu_mapping *mapping)
+{
+}
+#endif
+
+#endif /* __LINUX_DMA_MAPPING_FAST_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 694aafa..696b3ba 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -57,6 +57,10 @@
 	bool force_aperture;       /* DMA only allowed in mappable range? */
 };
 
+struct iommu_pgtbl_info {
+	void *pmds;
+};
+
 /* Domain feature flags */
 #define __IOMMU_DOMAIN_PAGING	(1U << 0)  /* Support for iommu_map/unmap */
 #define __IOMMU_DOMAIN_DMA_API	(1U << 1)  /* Domain for use in DMA-API
@@ -130,6 +134,7 @@
 	DOMAIN_ATTR_ATOMIC,
 	DOMAIN_ATTR_SECURE_VMID,
 	DOMAIN_ATTR_FAST,
+	DOMAIN_ATTR_PGTBL_INFO,
 	DOMAIN_ATTR_MAX,
 };
 
@@ -182,6 +187,8 @@
  * @reg_read: read an IOMMU register
  * @reg_write: write an IOMMU register
  * @tlbi_domain: Invalidate all TLBs covering an iommu domain
+ * @enable_config_clocks: Enable all config clocks for this domain's IOMMU
+ * @disable_config_clocks: Disable all config clocks for this domain's IOMMU
  */
 struct iommu_ops {
 	bool (*capable)(enum iommu_cap);
@@ -229,6 +236,8 @@
 	void (*reg_write)(struct iommu_domain *domain, unsigned long val,
 			  unsigned long offset);
 	void (*tlbi_domain)(struct iommu_domain *domain);
+	int (*enable_config_clocks)(struct iommu_domain *domain);
+	void (*disable_config_clocks)(struct iommu_domain *domain);
 
 	int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
 
@@ -384,6 +393,19 @@
 		domain->ops->tlbi_domain(domain);
 }
 
+static inline int iommu_enable_config_clocks(struct iommu_domain *domain)
+{
+	if (domain->ops->enable_config_clocks)
+		return domain->ops->enable_config_clocks(domain);
+	return 0;
+}
+
+static inline void iommu_disable_config_clocks(struct iommu_domain *domain)
+{
+	if (domain->ops->disable_config_clocks)
+		domain->ops->disable_config_clocks(domain);
+}
+
 #else /* CONFIG_IOMMU_API */
 
 struct iommu_ops {};
@@ -620,6 +642,15 @@
 {
 }
 
+static inline int iommu_enable_config_clocks(struct iommu_domain *domain)
+{
+	return 0;
+}
+
+static inline void iommu_disable_config_clocks(struct iommu_domain *domain)
+{
+}
+
 #endif /* CONFIG_IOMMU_API */
 
 #endif /* __LINUX_IOMMU_H */
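
A hedged sketch (not part of this patch) of how a caller would use the new
config-clock hooks to bracket direct access to SMMU configuration registers;
the register access itself is left as a placeholder:

    #include <linux/iommu.h>

    static int example_touch_smmu_config(struct iommu_domain *domain)
    {
    	int ret;

    	/* On arm-smmu this powers the SMMU on via arm_smmu_power_on(). */
    	ret = iommu_enable_config_clocks(domain);
    	if (ret)
    		return ret;

    	/* ... safe to access SMMU configuration registers here ... */

    	iommu_disable_config_clocks(domain);
    	return 0;
    }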