Merge tag 'iommu-updates-v4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull IOMMU updates from Joerg Roedel:
 "The updates include:

   - Small code cleanups in the AMD IOMMUv2 driver

   - Scalability improvements for the DMA-API implementation of the AMD
     IOMMU driver.  This is just a starting point, but it already showed
     good improvements in my tests.

   - Removal of the unused Renesas IPMMU/IPMMUI driver

   - Updates for ARM-SMMU include:
      * Some fixes to get the driver working nicely on Broadcom hardware
      * A change to the io-pgtable API to indicate the unit (granule) in
        which to flush (all callers converted, with Ack from Laurent); a
        minimal sketch of the new callback follows this summary
      * Use of devm_* for allocating/freeing the SMMUv3 buffers

   - Some other small fixes and improvements for other drivers"
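
For reference, the io-pgtable change above boils down to a new 'granule'
argument on the tlb_add_flush() callback, so drivers can invalidate a range
in granule-sized steps.  A minimal sketch follows, assuming nothing beyond
what the io-pgtable.h and arm-smmu hunks further down show; the example_*
names are made up for illustration:

	/* Hypothetical stand-in for a hardware invalidate-by-VA command */
	static void example_issue_tlbi(void *cookie, unsigned long iova,
				       bool leaf);

	/*
	 * New callback prototype from io-pgtable.h:
	 *
	 *   void (*tlb_add_flush)(unsigned long iova, size_t size,
	 *                         size_t granule, bool leaf, void *cookie);
	 *
	 * Walk the range in granule-sized steps; callers guarantee that
	 * size is a non-zero multiple of granule.
	 */
	static void example_tlb_add_flush(unsigned long iova, size_t size,
					  size_t granule, bool leaf,
					  void *cookie)
	{
		do {
			example_issue_tlbi(cookie, iova, leaf);
			iova += granule;
		} while (size -= granule);
	}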

* tag 'iommu-updates-v4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu: (46 commits)
  iommu/vt-d: Fix up error handling in alloc_iommu
  iommu/vt-d: Check the return value of iommu_device_create()
  iommu/amd: Remove an unneeded condition
  iommu/amd: Preallocate dma_ops apertures based on dma_mask
  iommu/amd: Use trylock to acquire bitmap_lock
  iommu/amd: Make dma_ops_domain->next_index percpu
  iommu/amd: Relax locking in dma_ops path
  iommu/amd: Initialize new aperture range before making it visible
  iommu/amd: Build io page-tables with cmpxchg64
  iommu/amd: Allocate new aperture ranges in dma_ops_alloc_addresses
  iommu/amd: Optimize dma_ops_free_addresses
  iommu/amd: Remove need_flush from struct dma_ops_domain
  iommu/amd: Iterate over all aperture ranges in dma_ops_area_alloc
  iommu/amd: Flush iommu tlb in dma_ops_free_addresses
  iommu/amd: Rename dma_ops_domain->next_address to next_index
  iommu/amd: Remove 'start' parameter from dma_ops_area_alloc
  iommu/amd: Flush iommu tlb in dma_ops_aperture_alloc()
  iommu/amd: Retry address allocation within one aperture
  iommu/amd: Move aperture_range.offset to another cache-line
  iommu/amd: Add dma_ops_aperture_alloc() function
  ...
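
The AMD scalability series below replaces the single allocator path with a
per-aperture bitmap_lock and a per-CPU search index, so CPUs normally hit
different ranges and skip contended ones rather than spinning.  A minimal
sketch of that fast path, using only the structures visible in the
amd_iommu.c hunks that follow; example_area_alloc() is a made-up helper
standing in for iommu_area_alloc() plus the retry logic, and the real code
additionally falls back to blocking locks and to growing the aperture when
this pass finds nothing:

	static unsigned long example_area_alloc(struct aperture_range *range,
						unsigned long pages);

	static unsigned long example_alloc(struct dma_ops_domain *dom,
					   unsigned long pages)
	{
		unsigned long addr = -1;
		unsigned long flags;
		u32 start, i;

		preempt_disable();
		start = this_cpu_read(*dom->next_index);

		for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
			u32 index = (start + i) % APERTURE_MAX_RANGES;
			struct aperture_range *range = dom->aperture[index];

			if (!range)
				continue;

			/* Skip contended ranges instead of spinning on them */
			if (!spin_trylock_irqsave(&range->bitmap_lock, flags))
				continue;

			addr = example_area_alloc(range, pages);
			spin_unlock_irqrestore(&range->bitmap_lock, flags);

			if (addr != -1) {
				/* Remember where this CPU found free space */
				this_cpu_write(*dom->next_index, index);
				break;
			}
		}

		preempt_enable();
		return addr;
	}
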
diff --git a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt
index cd29083..48ffb38 100644
--- a/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt
+++ b/Documentation/devicetree/bindings/iommu/renesas,ipmmu-vmsa.txt
@@ -7,7 +7,15 @@
 
 Required Properties:
 
-  - compatible: Must contain "renesas,ipmmu-vmsa".
+  - compatible: Must contain SoC-specific and generic entries from below.
+
+    - "renesas,ipmmu-r8a73a4" for the R8A73A4 (R-Mobile APE6) IPMMU.
+    - "renesas,ipmmu-r8a7790" for the R8A7790 (R-Car H2) IPMMU.
+    - "renesas,ipmmu-r8a7791" for the R8A7791 (R-Car M2-W) IPMMU.
+    - "renesas,ipmmu-r8a7793" for the R8A7793 (R-Car M2-N) IPMMU.
+    - "renesas,ipmmu-r8a7794" for the R8A7794 (R-Car E2) IPMMU.
+    - "renesas,ipmmu-vmsa" for generic R-Car Gen2 VMSA-compatible IPMMU.
+
   - reg: Base address and size of the IPMMU registers.
   - interrupts: Specifiers for the MMU fault interrupts. For instances that
     support secure mode two interrupts must be specified, for non-secure and
@@ -27,7 +35,7 @@
 Example: R8A7791 IPMMU-MX and VSP1-D0 bus master
 
 	ipmmu_mx: mmu@fe951000 {
-		compatible = "renasas,ipmmu-vmsa";
+		compatible = "renasas,ipmmu-r8a7791", "renasas,ipmmu-vmsa";
 		reg = <0 0xfe951000 0 0x1000>;
 		interrupts = <0 222 IRQ_TYPE_LEVEL_HIGH>,
 			     <0 221 IRQ_TYPE_LEVEL_HIGH>;
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index b9094e9..a1e75cb 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -263,81 +263,6 @@
 
 	  Say N unless you need kernel log message for IOMMU debugging.
 
-config SHMOBILE_IPMMU
-	bool
-
-config SHMOBILE_IPMMU_TLB
-	bool
-
-config SHMOBILE_IOMMU
-	bool "IOMMU for Renesas IPMMU/IPMMUI"
-	default n
-	depends on ARM && MMU
-	depends on ARCH_SHMOBILE || COMPILE_TEST
-	select IOMMU_API
-	select ARM_DMA_USE_IOMMU
-	select SHMOBILE_IPMMU
-	select SHMOBILE_IPMMU_TLB
-	help
-	  Support for Renesas IPMMU/IPMMUI. This option enables
-	  remapping of DMA memory accesses from all of the IP blocks
-	  on the ICB.
-
-	  Warning: Drivers (including userspace drivers of UIO
-	  devices) of the IP blocks on the ICB *must* use addresses
-	  allocated from the IPMMU (iova) for DMA with this option
-	  enabled.
-
-	  If unsure, say N.
-
-choice
-	prompt "IPMMU/IPMMUI address space size"
-	default SHMOBILE_IOMMU_ADDRSIZE_2048MB
-	depends on SHMOBILE_IOMMU
-	help
-	  This option sets IPMMU/IPMMUI address space size by
-	  adjusting the 1st level page table size. The page table size
-	  is calculated as follows:
-
-	      page table size = number of page table entries * 4 bytes
-	      number of page table entries = address space size / 1 MiB
-
-	  For example, when the address space size is 2048 MiB, the
-	  1st level page table size is 8192 bytes.
-
-	config SHMOBILE_IOMMU_ADDRSIZE_2048MB
-		bool "2 GiB"
-
-	config SHMOBILE_IOMMU_ADDRSIZE_1024MB
-		bool "1 GiB"
-
-	config SHMOBILE_IOMMU_ADDRSIZE_512MB
-		bool "512 MiB"
-
-	config SHMOBILE_IOMMU_ADDRSIZE_256MB
-		bool "256 MiB"
-
-	config SHMOBILE_IOMMU_ADDRSIZE_128MB
-		bool "128 MiB"
-
-	config SHMOBILE_IOMMU_ADDRSIZE_64MB
-		bool "64 MiB"
-
-	config SHMOBILE_IOMMU_ADDRSIZE_32MB
-		bool "32 MiB"
-
-endchoice
-
-config SHMOBILE_IOMMU_L1SIZE
-	int
-	default 8192 if SHMOBILE_IOMMU_ADDRSIZE_2048MB
-	default 4096 if SHMOBILE_IOMMU_ADDRSIZE_1024MB
-	default 2048 if SHMOBILE_IOMMU_ADDRSIZE_512MB
-	default 1024 if SHMOBILE_IOMMU_ADDRSIZE_256MB
-	default 512 if SHMOBILE_IOMMU_ADDRSIZE_128MB
-	default 256 if SHMOBILE_IOMMU_ADDRSIZE_64MB
-	default 128 if SHMOBILE_IOMMU_ADDRSIZE_32MB
-
 config IPMMU_VMSA
 	bool "Renesas VMSA-compatible IPMMU"
 	depends on ARM_LPAE
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 68faca02..42fc0c2 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -22,7 +22,5 @@
 obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o
 obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
 obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
-obj-$(CONFIG_SHMOBILE_IOMMU) += shmobile-iommu.o
-obj-$(CONFIG_SHMOBILE_IPMMU) += shmobile-ipmmu.o
 obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
 obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 8b2be1e..539b0de 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -35,6 +35,7 @@
 #include <linux/msi.h>
 #include <linux/dma-contiguous.h>
 #include <linux/irqdomain.h>
+#include <linux/percpu.h>
 #include <asm/irq_remapping.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
@@ -114,6 +115,45 @@
 static void update_domain(struct protection_domain *domain);
 static int protection_domain_init(struct protection_domain *domain);
 
+/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+	spinlock_t bitmap_lock;
+
+	/* address allocation bitmap */
+	unsigned long *bitmap;
+	unsigned long offset;
+	unsigned long next_bit;
+
+	/*
+	 * Array of PTE pages for the aperture. In this array we save all the
+	 * leaf pages of the domain page table used for the aperture. This way
+	 * we don't need to walk the page table to find a specific PTE. We can
+	 * just calculate its address in constant time.
+	 */
+	u64 *pte_pages[64];
+};
+
+/*
+ * Data container for a dma_ops specific protection domain
+ */
+struct dma_ops_domain {
+	/* generic protection domain information */
+	struct protection_domain domain;
+
+	/* size of the aperture for the mappings */
+	unsigned long aperture_size;
+
+	/* aperture index we start searching for free addresses */
+	u32 __percpu *next_index;
+
+	/* address space relevant data */
+	struct aperture_range *aperture[APERTURE_MAX_RANGES];
+};
+
 /****************************************************************************
  *
  * Helper functions
@@ -1167,11 +1207,21 @@
 	end_lvl = PAGE_SIZE_LEVEL(page_size);
 
 	while (level > end_lvl) {
-		if (!IOMMU_PTE_PRESENT(*pte)) {
+		u64 __pte, __npte;
+
+		__pte = *pte;
+
+		if (!IOMMU_PTE_PRESENT(__pte)) {
 			page = (u64 *)get_zeroed_page(gfp);
 			if (!page)
 				return NULL;
-			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+
+			__npte = PM_LEVEL_PDE(level, virt_to_phys(page));
+
+			if (cmpxchg64(pte, __pte, __npte)) {
+				free_page((unsigned long)page);
+				continue;
+			}
 		}
 
 		/* No level skipping support yet */
@@ -1376,8 +1426,10 @@
 			   bool populate, gfp_t gfp)
 {
 	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	struct amd_iommu *iommu;
 	unsigned long i, old_size, pte_pgsize;
+	struct aperture_range *range;
+	struct amd_iommu *iommu;
+	unsigned long flags;
 
 #ifdef CONFIG_IOMMU_STRESS
 	populate = false;
@@ -1386,15 +1438,17 @@
 	if (index >= APERTURE_MAX_RANGES)
 		return -ENOMEM;
 
-	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
-	if (!dma_dom->aperture[index])
+	range = kzalloc(sizeof(struct aperture_range), gfp);
+	if (!range)
 		return -ENOMEM;
 
-	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
-	if (!dma_dom->aperture[index]->bitmap)
+	range->bitmap = (void *)get_zeroed_page(gfp);
+	if (!range->bitmap)
 		goto out_free;
 
-	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+	range->offset = dma_dom->aperture_size;
+
+	spin_lock_init(&range->bitmap_lock);
 
 	if (populate) {
 		unsigned long address = dma_dom->aperture_size;
@@ -1407,14 +1461,20 @@
 			if (!pte)
 				goto out_free;
 
-			dma_dom->aperture[index]->pte_pages[i] = pte_page;
+			range->pte_pages[i] = pte_page;
 
 			address += APERTURE_RANGE_SIZE / 64;
 		}
 	}
 
-	old_size                = dma_dom->aperture_size;
-	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+	spin_lock_irqsave(&dma_dom->domain.lock, flags);
+
+	/* First take the bitmap_lock and then publish the range */
+	spin_lock(&range->bitmap_lock);
+
+	old_size                 = dma_dom->aperture_size;
+	dma_dom->aperture[index] = range;
+	dma_dom->aperture_size  += APERTURE_RANGE_SIZE;
 
 	/* Reserve address range used for MSI messages */
 	if (old_size < MSI_ADDR_BASE_LO &&
@@ -1461,62 +1521,123 @@
 
 	update_domain(&dma_dom->domain);
 
+	spin_unlock(&range->bitmap_lock);
+
+	spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
+
 	return 0;
 
 out_free:
 	update_domain(&dma_dom->domain);
 
-	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+	free_page((unsigned long)range->bitmap);
 
-	kfree(dma_dom->aperture[index]);
-	dma_dom->aperture[index] = NULL;
+	kfree(range);
 
 	return -ENOMEM;
 }
 
+static dma_addr_t dma_ops_aperture_alloc(struct dma_ops_domain *dom,
+					 struct aperture_range *range,
+					 unsigned long pages,
+					 unsigned long dma_mask,
+					 unsigned long boundary_size,
+					 unsigned long align_mask,
+					 bool trylock)
+{
+	unsigned long offset, limit, flags;
+	dma_addr_t address;
+	bool flush = false;
+
+	offset = range->offset >> PAGE_SHIFT;
+	limit  = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+					dma_mask >> PAGE_SHIFT);
+
+	if (trylock) {
+		if (!spin_trylock_irqsave(&range->bitmap_lock, flags))
+			return -1;
+	} else {
+		spin_lock_irqsave(&range->bitmap_lock, flags);
+	}
+
+	address = iommu_area_alloc(range->bitmap, limit, range->next_bit,
+				   pages, offset, boundary_size, align_mask);
+	if (address == -1) {
+		/* Nothing found, retry one time */
+		address = iommu_area_alloc(range->bitmap, limit,
+					   0, pages, offset, boundary_size,
+					   align_mask);
+		flush = true;
+	}
+
+	if (address != -1)
+		range->next_bit = address + pages;
+
+	spin_unlock_irqrestore(&range->bitmap_lock, flags);
+
+	if (flush) {
+		domain_flush_tlb(&dom->domain);
+		domain_flush_complete(&dom->domain);
+	}
+
+	return address;
+}
+
 static unsigned long dma_ops_area_alloc(struct device *dev,
 					struct dma_ops_domain *dom,
 					unsigned int pages,
 					unsigned long align_mask,
-					u64 dma_mask,
-					unsigned long start)
+					u64 dma_mask)
 {
-	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
-	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	int i = start >> APERTURE_RANGE_SHIFT;
 	unsigned long boundary_size, mask;
 	unsigned long address = -1;
-	unsigned long limit;
+	bool first = true;
+	u32 start, i;
 
-	next_bit >>= PAGE_SHIFT;
+	preempt_disable();
 
 	mask = dma_get_seg_boundary(dev);
 
+again:
+	start = this_cpu_read(*dom->next_index);
+
+	/* Sanity check - is it really necessary? */
+	if (unlikely(start > APERTURE_MAX_RANGES)) {
+		start = 0;
+		this_cpu_write(*dom->next_index, 0);
+	}
+
 	boundary_size = mask + 1 ? ALIGN(mask + 1, PAGE_SIZE) >> PAGE_SHIFT :
 				   1UL << (BITS_PER_LONG - PAGE_SHIFT);
 
-	for (;i < max_index; ++i) {
-		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+		struct aperture_range *range;
+		int index;
 
-		if (dom->aperture[i]->offset >= dma_mask)
-			break;
+		index = (start + i) % APERTURE_MAX_RANGES;
 
-		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
-					       dma_mask >> PAGE_SHIFT);
+		range = dom->aperture[index];
 
-		address = iommu_area_alloc(dom->aperture[i]->bitmap,
-					   limit, next_bit, pages, 0,
-					    boundary_size, align_mask);
+		if (!range || range->offset >= dma_mask)
+			continue;
+
+		address = dma_ops_aperture_alloc(dom, range, pages,
+						 dma_mask, boundary_size,
+						 align_mask, first);
 		if (address != -1) {
-			address = dom->aperture[i]->offset +
-				  (address << PAGE_SHIFT);
-			dom->next_address = address + (pages << PAGE_SHIFT);
+			address = range->offset + (address << PAGE_SHIFT);
+			this_cpu_write(*dom->next_index, index);
 			break;
 		}
-
-		next_bit = 0;
 	}
 
+	if (address == -1 && first) {
+		first = false;
+		goto again;
+	}
+
+	preempt_enable();
+
 	return address;
 }
 
@@ -1526,21 +1647,14 @@
 					     unsigned long align_mask,
 					     u64 dma_mask)
 {
-	unsigned long address;
+	unsigned long address = -1;
 
-#ifdef CONFIG_IOMMU_STRESS
-	dom->next_address = 0;
-	dom->need_flush = true;
-#endif
+	while (address == -1) {
+		address = dma_ops_area_alloc(dev, dom, pages,
+					     align_mask, dma_mask);
 
-	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-				     dma_mask, dom->next_address);
-
-	if (address == -1) {
-		dom->next_address = 0;
-		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-					     dma_mask, 0);
-		dom->need_flush = true;
+		if (address == -1 && alloc_new_range(dom, false, GFP_ATOMIC))
+			break;
 	}
 
 	if (unlikely(address == -1))
@@ -1562,6 +1676,7 @@
 {
 	unsigned i = address >> APERTURE_RANGE_SHIFT;
 	struct aperture_range *range = dom->aperture[i];
+	unsigned long flags;
 
 	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
@@ -1570,12 +1685,18 @@
 		return;
 #endif
 
-	if (address >= dom->next_address)
-		dom->need_flush = true;
+	if (amd_iommu_unmap_flush) {
+		domain_flush_tlb(&dom->domain);
+		domain_flush_complete(&dom->domain);
+	}
 
 	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
 
+	spin_lock_irqsave(&range->bitmap_lock, flags);
+	if (address + pages > range->next_bit)
+		range->next_bit = address + pages;
 	bitmap_clear(range->bitmap, address, pages);
+	spin_unlock_irqrestore(&range->bitmap_lock, flags);
 
 }
 
@@ -1755,6 +1876,8 @@
 	if (!dom)
 		return;
 
+	free_percpu(dom->next_index);
+
 	del_domain_from_list(&dom->domain);
 
 	free_pagetable(&dom->domain);
@@ -1769,6 +1892,23 @@
 	kfree(dom);
 }
 
+static int dma_ops_domain_alloc_apertures(struct dma_ops_domain *dma_dom,
+					  int max_apertures)
+{
+	int ret, i, apertures;
+
+	apertures = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+	ret       = 0;
+
+	for (i = apertures; i < max_apertures; ++i) {
+		ret = alloc_new_range(dma_dom, false, GFP_KERNEL);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 /*
  * Allocates a new protection domain usable for the dma_ops functions.
  * It also initializes the page table and the address allocator data
@@ -1777,6 +1917,7 @@
 static struct dma_ops_domain *dma_ops_domain_alloc(void)
 {
 	struct dma_ops_domain *dma_dom;
+	int cpu;
 
 	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
 	if (!dma_dom)
@@ -1785,6 +1926,10 @@
 	if (protection_domain_init(&dma_dom->domain))
 		goto free_dma_dom;
 
+	dma_dom->next_index = alloc_percpu(u32);
+	if (!dma_dom->next_index)
+		goto free_dma_dom;
+
 	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
 	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
 	dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1792,8 +1937,6 @@
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
 
-	dma_dom->need_flush = false;
-
 	add_domain_to_list(&dma_dom->domain);
 
 	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
@@ -1804,8 +1947,9 @@
 	 * a valid dma-address. So we can use 0 as error value
 	 */
 	dma_dom->aperture[0]->bitmap[0] = 1;
-	dma_dom->next_address = 0;
 
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(dma_dom->next_index, cpu) = 0;
 
 	return dma_dom;
 
@@ -2328,7 +2472,7 @@
 	else if (direction == DMA_BIDIRECTIONAL)
 		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
 
-	WARN_ON(*pte);
+	WARN_ON_ONCE(*pte);
 
 	*pte = __pte;
 
@@ -2357,7 +2501,7 @@
 
 	pte += PM_LEVEL_INDEX(0, address);
 
-	WARN_ON(!*pte);
+	WARN_ON_ONCE(!*pte);
 
 	*pte = 0ULL;
 }
@@ -2393,26 +2537,11 @@
 	if (align)
 		align_mask = (1UL << get_order(size)) - 1;
 
-retry:
 	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
 					  dma_mask);
-	if (unlikely(address == DMA_ERROR_CODE)) {
-		/*
-		 * setting next_address here will let the address
-		 * allocator only scan the new allocated range in the
-		 * first run. This is a small optimization.
-		 */
-		dma_dom->next_address = dma_dom->aperture_size;
 
-		if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
-			goto out;
-
-		/*
-		 * aperture was successfully enlarged by 128 MB, try
-		 * allocation again
-		 */
-		goto retry;
-	}
+	if (address == DMA_ERROR_CODE)
+		goto out;
 
 	start = address;
 	for (i = 0; i < pages; ++i) {
@@ -2427,11 +2556,10 @@
 
 	ADD_STATS_COUNTER(alloced_io_mem, size);
 
-	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
-		domain_flush_tlb(&dma_dom->domain);
-		dma_dom->need_flush = false;
-	} else if (unlikely(amd_iommu_np_cache))
+	if (unlikely(amd_iommu_np_cache)) {
 		domain_flush_pages(&dma_dom->domain, address, size);
+		domain_flush_complete(&dma_dom->domain);
+	}
 
 out:
 	return address;
@@ -2478,11 +2606,6 @@
 	SUB_STATS_COUNTER(alloced_io_mem, size);
 
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
-	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
-		domain_flush_pages(&dma_dom->domain, flush_addr, size);
-		dma_dom->need_flush = false;
-	}
 }
 
 /*
@@ -2493,11 +2616,9 @@
 			   enum dma_data_direction dir,
 			   struct dma_attrs *attrs)
 {
-	unsigned long flags;
-	struct protection_domain *domain;
-	dma_addr_t addr;
-	u64 dma_mask;
 	phys_addr_t paddr = page_to_phys(page) + offset;
+	struct protection_domain *domain;
+	u64 dma_mask;
 
 	INC_STATS_COUNTER(cnt_map_single);
 
@@ -2509,19 +2630,8 @@
 
 	dma_mask = *dev->dma_mask;
 
-	spin_lock_irqsave(&domain->lock, flags);
-
-	addr = __map_single(dev, domain->priv, paddr, size, dir, false,
+	return __map_single(dev, domain->priv, paddr, size, dir, false,
 			    dma_mask);
-	if (addr == DMA_ERROR_CODE)
-		goto out;
-
-	domain_flush_complete(domain);
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return addr;
 }
 
 /*
@@ -2530,7 +2640,6 @@
 static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
 		       enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-	unsigned long flags;
 	struct protection_domain *domain;
 
 	INC_STATS_COUNTER(cnt_unmap_single);
@@ -2539,13 +2648,7 @@
 	if (IS_ERR(domain))
 		return;
 
-	spin_lock_irqsave(&domain->lock, flags);
-
 	__unmap_single(domain->priv, dma_addr, size, dir);
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
 /*
@@ -2556,7 +2659,6 @@
 		  int nelems, enum dma_data_direction dir,
 		  struct dma_attrs *attrs)
 {
-	unsigned long flags;
 	struct protection_domain *domain;
 	int i;
 	struct scatterlist *s;
@@ -2572,8 +2674,6 @@
 
 	dma_mask = *dev->dma_mask;
 
-	spin_lock_irqsave(&domain->lock, flags);
-
 	for_each_sg(sglist, s, nelems, i) {
 		paddr = sg_phys(s);
 
@@ -2588,12 +2688,8 @@
 			goto unmap;
 	}
 
-	domain_flush_complete(domain);
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-
 	return mapped_elems;
+
 unmap:
 	for_each_sg(sglist, s, mapped_elems, i) {
 		if (s->dma_address)
@@ -2602,9 +2698,7 @@
 		s->dma_address = s->dma_length = 0;
 	}
 
-	mapped_elems = 0;
-
-	goto out;
+	return 0;
 }
 
 /*
@@ -2615,7 +2709,6 @@
 		     int nelems, enum dma_data_direction dir,
 		     struct dma_attrs *attrs)
 {
-	unsigned long flags;
 	struct protection_domain *domain;
 	struct scatterlist *s;
 	int i;
@@ -2626,17 +2719,11 @@
 	if (IS_ERR(domain))
 		return;
 
-	spin_lock_irqsave(&domain->lock, flags);
-
 	for_each_sg(sglist, s, nelems, i) {
 		__unmap_single(domain->priv, s->dma_address,
 			       s->dma_length, dir);
 		s->dma_address = s->dma_length = 0;
 	}
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
 /*
@@ -2648,7 +2735,6 @@
 {
 	u64 dma_mask = dev->coherent_dma_mask;
 	struct protection_domain *domain;
-	unsigned long flags;
 	struct page *page;
 
 	INC_STATS_COUNTER(cnt_alloc_coherent);
@@ -2680,19 +2766,11 @@
 	if (!dma_mask)
 		dma_mask = *dev->dma_mask;
 
-	spin_lock_irqsave(&domain->lock, flags);
-
 	*dma_addr = __map_single(dev, domain->priv, page_to_phys(page),
 				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
-	if (*dma_addr == DMA_ERROR_CODE) {
-		spin_unlock_irqrestore(&domain->lock, flags);
+	if (*dma_addr == DMA_ERROR_CODE)
 		goto out_free;
-	}
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
 
 	return page_address(page);
 
@@ -2712,7 +2790,6 @@
 			  struct dma_attrs *attrs)
 {
 	struct protection_domain *domain;
-	unsigned long flags;
 	struct page *page;
 
 	INC_STATS_COUNTER(cnt_free_coherent);
@@ -2724,14 +2801,8 @@
 	if (IS_ERR(domain))
 		goto free_mem;
 
-	spin_lock_irqsave(&domain->lock, flags);
-
 	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
 
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
 free_mem:
 	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
 		__free_pages(page, get_order(size));
@@ -2746,14 +2817,43 @@
 	return check_device(dev);
 }
 
+static int set_dma_mask(struct device *dev, u64 mask)
+{
+	struct protection_domain *domain;
+	int max_apertures = 1;
+
+	domain = get_domain(dev);
+	if (IS_ERR(domain))
+		return PTR_ERR(domain);
+
+	if (mask == DMA_BIT_MASK(64))
+		max_apertures = 8;
+	else if (mask > DMA_BIT_MASK(32))
+		max_apertures = 4;
+
+	/*
+	 * To prevent lock contention it doesn't make sense to allocate more
+	 * apertures than online cpus
+	 */
+	if (max_apertures > num_online_cpus())
+		max_apertures = num_online_cpus();
+
+	if (dma_ops_domain_alloc_apertures(domain->priv, max_apertures))
+		dev_err(dev, "Can't allocate %d iommu apertures\n",
+			max_apertures);
+
+	return 0;
+}
+
 static struct dma_map_ops amd_iommu_dma_ops = {
-	.alloc = alloc_coherent,
-	.free = free_coherent,
-	.map_page = map_page,
-	.unmap_page = unmap_page,
-	.map_sg = map_sg,
-	.unmap_sg = unmap_sg,
-	.dma_supported = amd_iommu_dma_supported,
+	.alloc		= alloc_coherent,
+	.free		= free_coherent,
+	.map_page	= map_page,
+	.unmap_page	= unmap_page,
+	.map_sg		= map_sg,
+	.unmap_sg	= unmap_sg,
+	.dma_supported	= amd_iommu_dma_supported,
+	.set_dma_mask	= set_dma_mask,
 };
 
 int __init amd_iommu_init_api(void)
@@ -3757,11 +3857,9 @@
 	case X86_IRQ_ALLOC_TYPE_MSI:
 	case X86_IRQ_ALLOC_TYPE_MSIX:
 		devid = get_device_id(&info->msi_dev->dev);
-		if (devid >= 0) {
-			iommu = amd_iommu_rlookup_table[devid];
-			if (iommu)
-				return iommu->msi_domain;
-		}
+		iommu = amd_iommu_rlookup_table[devid];
+		if (iommu)
+			return iommu->msi_domain;
 		break;
 	default:
 		break;
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index b08cf57..9d32b20 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -425,46 +425,6 @@
 };
 
 /*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
-	/* address allocation bitmap */
-	unsigned long *bitmap;
-
-	/*
-	 * Array of PTE pages for the aperture. In this array we save all the
-	 * leaf pages of the domain page table used for the aperture. This way
-	 * we don't need to walk the page table to find a specific PTE. We can
-	 * just calculate its address in constant time.
-	 */
-	u64 *pte_pages[64];
-
-	unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
-	/* generic protection domain information */
-	struct protection_domain domain;
-
-	/* size of the aperture for the mappings */
-	unsigned long aperture_size;
-
-	/* address we start to search for free addresses */
-	unsigned long next_address;
-
-	/* address space relevant data */
-	struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
-	/* This will be set to true when TLB needs to be flushed */
-	bool need_flush;
-};
-
-/*
  * Structure where we save information about one hardware AMD IOMMU in the
  * system.
  */
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 7caf2fa..c865737 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -432,7 +432,7 @@
 	unbind_pasid(pasid_state);
 }
 
-static struct mmu_notifier_ops iommu_mn = {
+static const struct mmu_notifier_ops iommu_mn = {
 	.release		= mn_release,
 	.clear_flush_young      = mn_clear_flush_young,
 	.invalidate_page        = mn_invalidate_page,
@@ -513,43 +513,39 @@
 static void do_fault(struct work_struct *work)
 {
 	struct fault *fault = container_of(work, struct fault, work);
-	struct mm_struct *mm;
 	struct vm_area_struct *vma;
+	int ret = VM_FAULT_ERROR;
+	unsigned int flags = 0;
+	struct mm_struct *mm;
 	u64 address;
-	int ret, write;
-
-	write = !!(fault->flags & PPR_FAULT_WRITE);
 
 	mm = fault->state->mm;
 	address = fault->address;
 
+	if (fault->flags & PPR_FAULT_USER)
+		flags |= FAULT_FLAG_USER;
+	if (fault->flags & PPR_FAULT_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
 	down_read(&mm->mmap_sem);
 	vma = find_extend_vma(mm, address);
-	if (!vma || address < vma->vm_start) {
+	if (!vma || address < vma->vm_start)
 		/* failed to get a vma in the right range */
-		up_read(&mm->mmap_sem);
-		handle_fault_error(fault);
 		goto out;
-	}
 
 	/* Check if we have the right permissions on the vma */
-	if (access_error(vma, fault)) {
-		up_read(&mm->mmap_sem);
-		handle_fault_error(fault);
+	if (access_error(vma, fault))
 		goto out;
-	}
 
-	ret = handle_mm_fault(mm, vma, address, write);
-	if (ret & VM_FAULT_ERROR) {
-		/* failed to service fault */
-		up_read(&mm->mmap_sem);
-		handle_fault_error(fault);
-		goto out;
-	}
-
-	up_read(&mm->mmap_sem);
+	ret = handle_mm_fault(mm, vma, address, flags);
 
 out:
+	up_read(&mm->mmap_sem);
+
+	if (ret & VM_FAULT_ERROR)
+		/* failed to service fault */
+		handle_fault_error(fault);
+
 	finish_pri_tag(fault->dev_state, fault->state, fault->tag);
 
 	put_pasid_state(fault->state);
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 4e5118a..2087534 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -40,7 +40,10 @@
 #define IDR0_ST_LVL_SHIFT		27
 #define IDR0_ST_LVL_MASK		0x3
 #define IDR0_ST_LVL_2LVL		(1 << IDR0_ST_LVL_SHIFT)
-#define IDR0_STALL_MODEL		(3 << 24)
+#define IDR0_STALL_MODEL_SHIFT		24
+#define IDR0_STALL_MODEL_MASK		0x3
+#define IDR0_STALL_MODEL_STALL		(0 << IDR0_STALL_MODEL_SHIFT)
+#define IDR0_STALL_MODEL_FORCE		(2 << IDR0_STALL_MODEL_SHIFT)
 #define IDR0_TTENDIAN_SHIFT		21
 #define IDR0_TTENDIAN_MASK		0x3
 #define IDR0_TTENDIAN_LE		(2 << IDR0_TTENDIAN_SHIFT)
@@ -253,6 +256,9 @@
 #define STRTAB_STE_1_STRW_EL2		2UL
 #define STRTAB_STE_1_STRW_SHIFT		30
 
+#define STRTAB_STE_1_SHCFG_INCOMING	1UL
+#define STRTAB_STE_1_SHCFG_SHIFT	44
+
 #define STRTAB_STE_2_S2VMID_SHIFT	0
 #define STRTAB_STE_2_S2VMID_MASK	0xffffUL
 #define STRTAB_STE_2_VTCR_SHIFT		32
@@ -378,7 +384,6 @@
 #define PRIQ_0_SID_MASK			0xffffffffUL
 #define PRIQ_0_SSID_SHIFT		32
 #define PRIQ_0_SSID_MASK		0xfffffUL
-#define PRIQ_0_OF			(1UL << 57)
 #define PRIQ_0_PERM_PRIV		(1UL << 58)
 #define PRIQ_0_PERM_EXEC		(1UL << 59)
 #define PRIQ_0_PERM_READ		(1UL << 60)
@@ -855,15 +860,17 @@
 	};
 
 	dev_err(smmu->dev, "CMDQ error (cons 0x%08x): %s\n", cons,
-		cerror_str[idx]);
+		idx < ARRAY_SIZE(cerror_str) ?  cerror_str[idx] : "Unknown");
 
 	switch (idx) {
-	case CMDQ_ERR_CERROR_ILL_IDX:
-		break;
 	case CMDQ_ERR_CERROR_ABT_IDX:
 		dev_err(smmu->dev, "retrying command fetch\n");
 	case CMDQ_ERR_CERROR_NONE_IDX:
 		return;
+	case CMDQ_ERR_CERROR_ILL_IDX:
+		/* Fallthrough */
+	default:
+		break;
 	}
 
 	/*
@@ -1042,6 +1049,8 @@
 		val |= disable_bypass ? STRTAB_STE_0_CFG_ABORT
 				      : STRTAB_STE_0_CFG_BYPASS;
 		dst[0] = cpu_to_le64(val);
+		dst[1] = cpu_to_le64(STRTAB_STE_1_SHCFG_INCOMING
+			 << STRTAB_STE_1_SHCFG_SHIFT);
 		dst[2] = 0; /* Nuke the VMID */
 		if (ste_live)
 			arm_smmu_sync_ste_for_sid(smmu, sid);
@@ -1056,12 +1065,14 @@
 			 STRTAB_STE_1_S1C_CACHE_WBRA
 			 << STRTAB_STE_1_S1COR_SHIFT |
 			 STRTAB_STE_1_S1C_SH_ISH << STRTAB_STE_1_S1CSH_SHIFT |
-			 STRTAB_STE_1_S1STALLD |
 #ifdef CONFIG_PCI_ATS
 			 STRTAB_STE_1_EATS_TRANS << STRTAB_STE_1_EATS_SHIFT |
 #endif
 			 STRTAB_STE_1_STRW_NSEL1 << STRTAB_STE_1_STRW_SHIFT);
 
+		if (smmu->features & ARM_SMMU_FEAT_STALLS)
+			dst[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD);
+
 		val |= (ste->s1_cfg->cdptr_dma & STRTAB_STE_0_S1CTXPTR_MASK
 		        << STRTAB_STE_0_S1CTXPTR_SHIFT) |
 			STRTAB_STE_0_CFG_S1_TRANS;
@@ -1123,8 +1134,8 @@
 	strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS];
 
 	desc->span = STRTAB_SPLIT + 1;
-	desc->l2ptr = dma_zalloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
-					  GFP_KERNEL);
+	desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma,
+					  GFP_KERNEL | __GFP_ZERO);
 	if (!desc->l2ptr) {
 		dev_err(smmu->dev,
 			"failed to allocate l2 stream table for SID %u\n",
@@ -1250,50 +1261,50 @@
 
 static irqreturn_t arm_smmu_gerror_handler(int irq, void *dev)
 {
-	u32 gerror, gerrorn;
+	u32 gerror, gerrorn, active;
 	struct arm_smmu_device *smmu = dev;
 
 	gerror = readl_relaxed(smmu->base + ARM_SMMU_GERROR);
 	gerrorn = readl_relaxed(smmu->base + ARM_SMMU_GERRORN);
 
-	gerror ^= gerrorn;
-	if (!(gerror & GERROR_ERR_MASK))
+	active = gerror ^ gerrorn;
+	if (!(active & GERROR_ERR_MASK))
 		return IRQ_NONE; /* No errors pending */
 
 	dev_warn(smmu->dev,
 		 "unexpected global error reported (0x%08x), this could be serious\n",
-		 gerror);
+		 active);
 
-	if (gerror & GERROR_SFM_ERR) {
+	if (active & GERROR_SFM_ERR) {
 		dev_err(smmu->dev, "device has entered Service Failure Mode!\n");
 		arm_smmu_device_disable(smmu);
 	}
 
-	if (gerror & GERROR_MSI_GERROR_ABT_ERR)
+	if (active & GERROR_MSI_GERROR_ABT_ERR)
 		dev_warn(smmu->dev, "GERROR MSI write aborted\n");
 
-	if (gerror & GERROR_MSI_PRIQ_ABT_ERR) {
+	if (active & GERROR_MSI_PRIQ_ABT_ERR) {
 		dev_warn(smmu->dev, "PRIQ MSI write aborted\n");
 		arm_smmu_priq_handler(irq, smmu->dev);
 	}
 
-	if (gerror & GERROR_MSI_EVTQ_ABT_ERR) {
+	if (active & GERROR_MSI_EVTQ_ABT_ERR) {
 		dev_warn(smmu->dev, "EVTQ MSI write aborted\n");
 		arm_smmu_evtq_handler(irq, smmu->dev);
 	}
 
-	if (gerror & GERROR_MSI_CMDQ_ABT_ERR) {
+	if (active & GERROR_MSI_CMDQ_ABT_ERR) {
 		dev_warn(smmu->dev, "CMDQ MSI write aborted\n");
 		arm_smmu_cmdq_sync_handler(irq, smmu->dev);
 	}
 
-	if (gerror & GERROR_PRIQ_ABT_ERR)
+	if (active & GERROR_PRIQ_ABT_ERR)
 		dev_err(smmu->dev, "PRIQ write aborted -- events may have been lost\n");
 
-	if (gerror & GERROR_EVTQ_ABT_ERR)
+	if (active & GERROR_EVTQ_ABT_ERR)
 		dev_err(smmu->dev, "EVTQ write aborted -- events may have been lost\n");
 
-	if (gerror & GERROR_CMDQ_ERR)
+	if (active & GERROR_CMDQ_ERR)
 		arm_smmu_cmdq_skip_err(smmu);
 
 	writel(gerror, smmu->base + ARM_SMMU_GERRORN);
@@ -1335,7 +1346,7 @@
 }
 
 static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
-					  bool leaf, void *cookie)
+					  size_t granule, bool leaf, void *cookie)
 {
 	struct arm_smmu_domain *smmu_domain = cookie;
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
@@ -1354,7 +1365,10 @@
 		cmd.tlbi.vmid	= smmu_domain->s2_cfg.vmid;
 	}
 
-	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+	do {
+		arm_smmu_cmdq_issue_cmd(smmu, &cmd);
+		cmd.tlbi.addr += granule;
+	} while (size -= granule);
 }
 
 static struct iommu_gather_ops arm_smmu_gather_ops = {
@@ -1429,10 +1443,10 @@
 		struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
 
 		if (cfg->cdptr) {
-			dma_free_coherent(smmu_domain->smmu->dev,
-					  CTXDESC_CD_DWORDS << 3,
-					  cfg->cdptr,
-					  cfg->cdptr_dma);
+			dmam_free_coherent(smmu_domain->smmu->dev,
+					   CTXDESC_CD_DWORDS << 3,
+					   cfg->cdptr,
+					   cfg->cdptr_dma);
 
 			arm_smmu_bitmap_free(smmu->asid_map, cfg->cd.asid);
 		}
@@ -1457,8 +1471,9 @@
 	if (IS_ERR_VALUE(asid))
 		return asid;
 
-	cfg->cdptr = dma_zalloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
-					 &cfg->cdptr_dma, GFP_KERNEL);
+	cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
+					 &cfg->cdptr_dma,
+					 GFP_KERNEL | __GFP_ZERO);
 	if (!cfg->cdptr) {
 		dev_warn(smmu->dev, "failed to allocate context descriptor\n");
 		ret = -ENOMEM;
@@ -1804,13 +1819,13 @@
 		smmu = arm_smmu_get_for_pci_dev(pdev);
 		if (!smmu) {
 			ret = -ENOENT;
-			goto out_put_group;
+			goto out_remove_dev;
 		}
 
 		smmu_group = kzalloc(sizeof(*smmu_group), GFP_KERNEL);
 		if (!smmu_group) {
 			ret = -ENOMEM;
-			goto out_put_group;
+			goto out_remove_dev;
 		}
 
 		smmu_group->ste.valid	= true;
@@ -1826,20 +1841,20 @@
 	for (i = 0; i < smmu_group->num_sids; ++i) {
 		/* If we already know about this SID, then we're done */
 		if (smmu_group->sids[i] == sid)
-			return 0;
+			goto out_put_group;
 	}
 
 	/* Check the SID is in range of the SMMU and our stream table */
 	if (!arm_smmu_sid_in_range(smmu, sid)) {
 		ret = -ERANGE;
-		goto out_put_group;
+		goto out_remove_dev;
 	}
 
 	/* Ensure l2 strtab is initialised */
 	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
 		ret = arm_smmu_init_l2_strtab(smmu, sid);
 		if (ret)
-			goto out_put_group;
+			goto out_remove_dev;
 	}
 
 	/* Resize the SID array for the group */
@@ -1849,16 +1864,20 @@
 	if (!sids) {
 		smmu_group->num_sids--;
 		ret = -ENOMEM;
-		goto out_put_group;
+		goto out_remove_dev;
 	}
 
 	/* Add the new SID */
 	sids[smmu_group->num_sids - 1] = sid;
 	smmu_group->sids = sids;
-	return 0;
 
 out_put_group:
 	iommu_group_put(group);
+	return 0;
+
+out_remove_dev:
+	iommu_group_remove_device(dev);
+	iommu_group_put(group);
 	return ret;
 }
 
@@ -1937,7 +1956,7 @@
 {
 	size_t qsz = ((1 << q->max_n_shift) * dwords) << 3;
 
-	q->base = dma_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL);
+	q->base = dmam_alloc_coherent(smmu->dev, qsz, &q->base_dma, GFP_KERNEL);
 	if (!q->base) {
 		dev_err(smmu->dev, "failed to allocate queue (0x%zx bytes)\n",
 			qsz);
@@ -1957,23 +1976,6 @@
 	return 0;
 }
 
-static void arm_smmu_free_one_queue(struct arm_smmu_device *smmu,
-				    struct arm_smmu_queue *q)
-{
-	size_t qsz = ((1 << q->max_n_shift) * q->ent_dwords) << 3;
-
-	dma_free_coherent(smmu->dev, qsz, q->base, q->base_dma);
-}
-
-static void arm_smmu_free_queues(struct arm_smmu_device *smmu)
-{
-	arm_smmu_free_one_queue(smmu, &smmu->cmdq.q);
-	arm_smmu_free_one_queue(smmu, &smmu->evtq.q);
-
-	if (smmu->features & ARM_SMMU_FEAT_PRI)
-		arm_smmu_free_one_queue(smmu, &smmu->priq.q);
-}
-
 static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
 {
 	int ret;
@@ -1983,49 +1985,20 @@
 	ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
 				      ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS);
 	if (ret)
-		goto out;
+		return ret;
 
 	/* evtq */
 	ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
 				      ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS);
 	if (ret)
-		goto out_free_cmdq;
+		return ret;
 
 	/* priq */
 	if (!(smmu->features & ARM_SMMU_FEAT_PRI))
 		return 0;
 
-	ret = arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
-				      ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS);
-	if (ret)
-		goto out_free_evtq;
-
-	return 0;
-
-out_free_evtq:
-	arm_smmu_free_one_queue(smmu, &smmu->evtq.q);
-out_free_cmdq:
-	arm_smmu_free_one_queue(smmu, &smmu->cmdq.q);
-out:
-	return ret;
-}
-
-static void arm_smmu_free_l2_strtab(struct arm_smmu_device *smmu)
-{
-	int i;
-	size_t size;
-	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
-
-	size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3);
-	for (i = 0; i < cfg->num_l1_ents; ++i) {
-		struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[i];
-
-		if (!desc->l2ptr)
-			continue;
-
-		dma_free_coherent(smmu->dev, size, desc->l2ptr,
-				  desc->l2ptr_dma);
-	}
+	return arm_smmu_init_one_queue(smmu, &smmu->priq.q, ARM_SMMU_PRIQ_PROD,
+				       ARM_SMMU_PRIQ_CONS, PRIQ_ENT_DWORDS);
 }
 
 static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu)
@@ -2054,7 +2027,6 @@
 	void *strtab;
 	u64 reg;
 	u32 size, l1size;
-	int ret;
 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
 
 	/*
@@ -2077,8 +2049,8 @@
 			 size, smmu->sid_bits);
 
 	l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
-	strtab = dma_zalloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
-				     GFP_KERNEL);
+	strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
+				     GFP_KERNEL | __GFP_ZERO);
 	if (!strtab) {
 		dev_err(smmu->dev,
 			"failed to allocate l1 stream table (%u bytes)\n",
@@ -2095,13 +2067,7 @@
 		<< STRTAB_BASE_CFG_SPLIT_SHIFT;
 	cfg->strtab_base_cfg = reg;
 
-	ret = arm_smmu_init_l1_strtab(smmu);
-	if (ret)
-		dma_free_coherent(smmu->dev,
-				  l1size,
-				  strtab,
-				  cfg->strtab_dma);
-	return ret;
+	return arm_smmu_init_l1_strtab(smmu);
 }
 
 static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu)
@@ -2112,8 +2078,8 @@
 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
 
 	size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3);
-	strtab = dma_zalloc_coherent(smmu->dev, size, &cfg->strtab_dma,
-				     GFP_KERNEL);
+	strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma,
+				     GFP_KERNEL | __GFP_ZERO);
 	if (!strtab) {
 		dev_err(smmu->dev,
 			"failed to allocate linear stream table (%u bytes)\n",
@@ -2157,21 +2123,6 @@
 	return 0;
 }
 
-static void arm_smmu_free_strtab(struct arm_smmu_device *smmu)
-{
-	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
-	u32 size = cfg->num_l1_ents;
-
-	if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) {
-		arm_smmu_free_l2_strtab(smmu);
-		size *= STRTAB_L1_DESC_DWORDS << 3;
-	} else {
-		size *= STRTAB_STE_DWORDS * 3;
-	}
-
-	dma_free_coherent(smmu->dev, size, cfg->strtab, cfg->strtab_dma);
-}
-
 static int arm_smmu_init_structures(struct arm_smmu_device *smmu)
 {
 	int ret;
@@ -2180,21 +2131,7 @@
 	if (ret)
 		return ret;
 
-	ret = arm_smmu_init_strtab(smmu);
-	if (ret)
-		goto out_free_queues;
-
-	return 0;
-
-out_free_queues:
-	arm_smmu_free_queues(smmu);
-	return ret;
-}
-
-static void arm_smmu_free_structures(struct arm_smmu_device *smmu)
-{
-	arm_smmu_free_strtab(smmu);
-	arm_smmu_free_queues(smmu);
+	return arm_smmu_init_strtab(smmu);
 }
 
 static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val,
@@ -2532,8 +2469,12 @@
 		dev_warn(smmu->dev, "IDR0.COHACC overridden by dma-coherent property (%s)\n",
 			 coherent ? "true" : "false");
 
-	if (reg & IDR0_STALL_MODEL)
+	switch (reg & IDR0_STALL_MODEL_MASK << IDR0_STALL_MODEL_SHIFT) {
+	case IDR0_STALL_MODEL_STALL:
+		/* Fallthrough */
+	case IDR0_STALL_MODEL_FORCE:
 		smmu->features |= ARM_SMMU_FEAT_STALLS;
+	}
 
 	if (reg & IDR0_S1P)
 		smmu->features |= ARM_SMMU_FEAT_TRANS_S1;
@@ -2699,15 +2640,7 @@
 	platform_set_drvdata(pdev, smmu);
 
 	/* Reset the device */
-	ret = arm_smmu_device_reset(smmu);
-	if (ret)
-		goto out_free_structures;
-
-	return 0;
-
-out_free_structures:
-	arm_smmu_free_structures(smmu);
-	return ret;
+	return arm_smmu_device_reset(smmu);
 }
 
 static int arm_smmu_device_remove(struct platform_device *pdev)
@@ -2715,7 +2648,6 @@
 	struct arm_smmu_device *smmu = platform_get_drvdata(pdev);
 
 	arm_smmu_device_disable(smmu);
-	arm_smmu_free_structures(smmu);
 	return 0;
 }
 
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 47dc7a7..59ee4b8 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -582,7 +582,7 @@
 }
 
 static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
-					  bool leaf, void *cookie)
+					  size_t granule, bool leaf, void *cookie)
 {
 	struct arm_smmu_domain *smmu_domain = cookie;
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
@@ -597,12 +597,18 @@
 		if (!IS_ENABLED(CONFIG_64BIT) || smmu->version == ARM_SMMU_V1) {
 			iova &= ~12UL;
 			iova |= ARM_SMMU_CB_ASID(cfg);
-			writel_relaxed(iova, reg);
+			do {
+				writel_relaxed(iova, reg);
+				iova += granule;
+			} while (size -= granule);
 #ifdef CONFIG_64BIT
 		} else {
 			iova >>= 12;
 			iova |= (u64)ARM_SMMU_CB_ASID(cfg) << 48;
-			writeq_relaxed(iova, reg);
+			do {
+				writeq_relaxed(iova, reg);
+				iova += granule >> 12;
+			} while (size -= granule);
 #endif
 		}
 #ifdef CONFIG_64BIT
@@ -610,7 +616,11 @@
 		reg = ARM_SMMU_CB_BASE(smmu) + ARM_SMMU_CB(smmu, cfg->cbndx);
 		reg += leaf ? ARM_SMMU_CB_S2_TLBIIPAS2L :
 			      ARM_SMMU_CB_S2_TLBIIPAS2;
-		writeq_relaxed(iova >> 12, reg);
+		iova >>= 12;
+		do {
+			writeq_relaxed(iova, reg);
+			iova += granule >> 12;
+		} while (size -= granule);
 #endif
 	} else {
 		reg = ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_TLBIVMID;
@@ -945,9 +955,7 @@
 		free_irq(irq, domain);
 	}
 
-	if (smmu_domain->pgtbl_ops)
-		free_io_pgtable_ops(smmu_domain->pgtbl_ops);
-
+	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
 	__arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
 }
 
@@ -1357,6 +1365,7 @@
 	if (IS_ERR(group))
 		return PTR_ERR(group);
 
+	iommu_group_put(group);
 	return 0;
 }
 
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 80e3c17..62a400c 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1063,13 +1063,19 @@
 
 	raw_spin_lock_init(&iommu->register_lock);
 
-	drhd->iommu = iommu;
-
-	if (intel_iommu_enabled)
+	if (intel_iommu_enabled) {
 		iommu->iommu_dev = iommu_device_create(NULL, iommu,
 						       intel_iommu_groups,
 						       "%s", iommu->name);
 
+		if (IS_ERR(iommu->iommu_dev)) {
+			err = PTR_ERR(iommu->iommu_dev);
+			goto err_unmap;
+		}
+	}
+
+	drhd->iommu = iommu;
+
 	return 0;
 
 err_unmap:
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 7df9777..8bbcbfe 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -38,9 +38,6 @@
 #define io_pgtable_to_data(x)						\
 	container_of((x), struct arm_lpae_io_pgtable, iop)
 
-#define io_pgtable_ops_to_pgtable(x)					\
-	container_of((x), struct io_pgtable, ops)
-
 #define io_pgtable_ops_to_data(x)					\
 	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
 
@@ -58,8 +55,10 @@
 	((((d)->levels - ((l) - ARM_LPAE_START_LVL(d) + 1))		\
 	  * (d)->bits_per_level) + (d)->pg_shift)
 
+#define ARM_LPAE_GRANULE(d)		(1UL << (d)->pg_shift)
+
 #define ARM_LPAE_PAGES_PER_PGD(d)					\
-	DIV_ROUND_UP((d)->pgd_size, 1UL << (d)->pg_shift)
+	DIV_ROUND_UP((d)->pgd_size, ARM_LPAE_GRANULE(d))
 
 /*
  * Calculate the index at level l used to map virtual address a using the
@@ -169,7 +168,7 @@
 /* IOPTE accessors */
 #define iopte_deref(pte,d)					\
 	(__va((pte) & ((1ULL << ARM_LPAE_MAX_ADDR_BITS) - 1)	\
-	& ~((1ULL << (d)->pg_shift) - 1)))
+	& ~(ARM_LPAE_GRANULE(d) - 1ULL)))
 
 #define iopte_type(pte,l)					\
 	(((pte) >> ARM_LPAE_PTE_TYPE_SHIFT) & ARM_LPAE_PTE_TYPE_MASK)
@@ -326,7 +325,7 @@
 	/* Grab a pointer to the next level */
 	pte = *ptep;
 	if (!pte) {
-		cptep = __arm_lpae_alloc_pages(1UL << data->pg_shift,
+		cptep = __arm_lpae_alloc_pages(ARM_LPAE_GRANULE(data),
 					       GFP_ATOMIC, cfg);
 		if (!cptep)
 			return -ENOMEM;
@@ -405,17 +404,18 @@
 	arm_lpae_iopte *start, *end;
 	unsigned long table_size;
 
-	/* Only leaf entries at the last level */
-	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
-		return;
-
 	if (lvl == ARM_LPAE_START_LVL(data))
 		table_size = data->pgd_size;
 	else
-		table_size = 1UL << data->pg_shift;
+		table_size = ARM_LPAE_GRANULE(data);
 
 	start = ptep;
-	end = (void *)ptep + table_size;
+
+	/* Only leaf entries at the last level */
+	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
+		end = ptep;
+	else
+		end = (void *)ptep + table_size;
 
 	while (ptep != end) {
 		arm_lpae_iopte pte = *ptep++;
@@ -473,7 +473,7 @@
 
 	__arm_lpae_set_pte(ptep, table, cfg);
 	iova &= ~(blk_size - 1);
-	cfg->tlb->tlb_add_flush(iova, blk_size, true, data->iop.cookie);
+	cfg->tlb->tlb_add_flush(iova, blk_size, blk_size, true, data->iop.cookie);
 	return size;
 }
 
@@ -486,11 +486,13 @@
 	void *cookie = data->iop.cookie;
 	size_t blk_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
 
+	/* Something went horribly wrong and we ran out of page table */
+	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
+		return 0;
+
 	ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
 	pte = *ptep;
-
-	/* Something went horribly wrong and we ran out of page table */
-	if (WARN_ON(!pte || (lvl == ARM_LPAE_MAX_LEVELS)))
+	if (WARN_ON(!pte))
 		return 0;
 
 	/* If the size matches this level, we're in the right place */
@@ -499,12 +501,13 @@
 
 		if (!iopte_leaf(pte, lvl)) {
 			/* Also flush any partial walks */
-			tlb->tlb_add_flush(iova, size, false, cookie);
+			tlb->tlb_add_flush(iova, size, ARM_LPAE_GRANULE(data),
+					   false, cookie);
 			tlb->tlb_sync(cookie);
 			ptep = iopte_deref(pte, data);
 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
 		} else {
-			tlb->tlb_add_flush(iova, size, true, cookie);
+			tlb->tlb_add_flush(iova, size, size, true, cookie);
 		}
 
 		return size;
@@ -570,7 +573,7 @@
 	return 0;
 
 found_translation:
-	iova &= ((1 << data->pg_shift) - 1);
+	iova &= (ARM_LPAE_GRANULE(data) - 1);
 	return ((phys_addr_t)iopte_to_pfn(pte,data) << data->pg_shift) | iova;
 }
 
@@ -668,7 +671,7 @@
 	      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN0_SHIFT) |
 	      (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN0_SHIFT);
 
-	switch (1 << data->pg_shift) {
+	switch (ARM_LPAE_GRANULE(data)) {
 	case SZ_4K:
 		reg |= ARM_LPAE_TCR_TG0_4K;
 		break;
@@ -769,7 +772,7 @@
 
 	sl = ARM_LPAE_START_LVL(data);
 
-	switch (1 << data->pg_shift) {
+	switch (ARM_LPAE_GRANULE(data)) {
 	case SZ_4K:
 		reg |= ARM_LPAE_TCR_TG0_4K;
 		sl++; /* SL0 format is different for 4K granule size */
@@ -889,8 +892,8 @@
 	WARN_ON(cookie != cfg_cookie);
 }
 
-static void dummy_tlb_add_flush(unsigned long iova, size_t size, bool leaf,
-				void *cookie)
+static void dummy_tlb_add_flush(unsigned long iova, size_t size,
+				size_t granule, bool leaf, void *cookie)
 {
 	WARN_ON(cookie != cfg_cookie);
 	WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index ac9e234..36673c8 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -26,8 +26,8 @@
  */
 struct iommu_gather_ops {
 	void (*tlb_flush_all)(void *cookie);
-	void (*tlb_add_flush)(unsigned long iova, size_t size, bool leaf,
-			      void *cookie);
+	void (*tlb_add_flush)(unsigned long iova, size_t size, size_t granule,
+			      bool leaf, void *cookie);
 	void (*tlb_sync)(void *cookie);
 };
 
@@ -131,6 +131,8 @@
 	struct io_pgtable_ops	ops;
 };
 
+#define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
+
 /**
  * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
  *                              particular format.
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index dfb868e..2fdbac6 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -277,8 +277,8 @@
 	ipmmu_tlb_invalidate(domain);
 }
 
-static void ipmmu_tlb_add_flush(unsigned long iova, size_t size, bool leaf,
-				void *cookie)
+static void ipmmu_tlb_add_flush(unsigned long iova, size_t size,
+				size_t granule, bool leaf, void *cookie)
 {
 	/* The hardware doesn't support selective TLB flush. */
 }
diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c
index b6d01f9..4b09e81 100644
--- a/drivers/iommu/msm_iommu_dev.c
+++ b/drivers/iommu/msm_iommu_dev.c
@@ -359,30 +359,19 @@
 	.remove		= msm_iommu_ctx_remove,
 };
 
+static struct platform_driver * const drivers[] = {
+	&msm_iommu_driver,
+	&msm_iommu_ctx_driver,
+};
+
 static int __init msm_iommu_driver_init(void)
 {
-	int ret;
-	ret = platform_driver_register(&msm_iommu_driver);
-	if (ret != 0) {
-		pr_err("Failed to register IOMMU driver\n");
-		goto error;
-	}
-
-	ret = platform_driver_register(&msm_iommu_ctx_driver);
-	if (ret != 0) {
-		platform_driver_unregister(&msm_iommu_driver);
-		pr_err("Failed to register IOMMU context driver\n");
-		goto error;
-	}
-
-error:
-	return ret;
+	return platform_register_drivers(drivers, ARRAY_SIZE(drivers));
 }
 
 static void __exit msm_iommu_driver_exit(void)
 {
-	platform_driver_unregister(&msm_iommu_ctx_driver);
-	platform_driver_unregister(&msm_iommu_driver);
+	platform_unregister_drivers(drivers, ARRAY_SIZE(drivers));
 }
 
 subsys_initcall(msm_iommu_driver_init);
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 471ee36..a04d491 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -49,7 +49,7 @@
 	}
 }
 
-struct iommu_domain *s390_domain_alloc(unsigned domain_type)
+static struct iommu_domain *s390_domain_alloc(unsigned domain_type)
 {
 	struct s390_domain *s390_domain;
 
@@ -73,7 +73,7 @@
 	return &s390_domain->domain;
 }
 
-void s390_domain_free(struct iommu_domain *domain)
+static void s390_domain_free(struct iommu_domain *domain)
 {
 	struct s390_domain *s390_domain = to_s390_domain(domain);
 
diff --git a/drivers/iommu/shmobile-iommu.c b/drivers/iommu/shmobile-iommu.c
deleted file mode 100644
index a028751..0000000
--- a/drivers/iommu/shmobile-iommu.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * IOMMU for IPMMU/IPMMUI
- * Copyright (C) 2012  Hideki EIRAKU
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- */
-
-#include <linux/dma-mapping.h>
-#include <linux/io.h>
-#include <linux/iommu.h>
-#include <linux/platform_device.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <asm/dma-iommu.h>
-#include "shmobile-ipmmu.h"
-
-#define L1_SIZE CONFIG_SHMOBILE_IOMMU_L1SIZE
-#define L1_LEN (L1_SIZE / 4)
-#define L1_ALIGN L1_SIZE
-#define L2_SIZE SZ_1K
-#define L2_LEN (L2_SIZE / 4)
-#define L2_ALIGN L2_SIZE
-
-struct shmobile_iommu_domain_pgtable {
-	uint32_t *pgtable;
-	dma_addr_t handle;
-};
-
-struct shmobile_iommu_archdata {
-	struct list_head attached_list;
-	struct dma_iommu_mapping *iommu_mapping;
-	spinlock_t attach_lock;
-	struct shmobile_iommu_domain *attached;
-	int num_attached_devices;
-	struct shmobile_ipmmu *ipmmu;
-};
-
-struct shmobile_iommu_domain {
-	struct shmobile_iommu_domain_pgtable l1, l2[L1_LEN];
-	spinlock_t map_lock;
-	spinlock_t attached_list_lock;
-	struct list_head attached_list;
-	struct iommu_domain domain;
-};
-
-static struct shmobile_iommu_archdata *ipmmu_archdata;
-static struct kmem_cache *l1cache, *l2cache;
-
-static struct shmobile_iommu_domain *to_sh_domain(struct iommu_domain *dom)
-{
-	return container_of(dom, struct shmobile_iommu_domain, domain);
-}
-
-static int pgtable_alloc(struct shmobile_iommu_domain_pgtable *pgtable,
-			 struct kmem_cache *cache, size_t size)
-{
-	pgtable->pgtable = kmem_cache_zalloc(cache, GFP_ATOMIC);
-	if (!pgtable->pgtable)
-		return -ENOMEM;
-	pgtable->handle = dma_map_single(NULL, pgtable->pgtable, size,
-					 DMA_TO_DEVICE);
-	return 0;
-}
-
-static void pgtable_free(struct shmobile_iommu_domain_pgtable *pgtable,
-			 struct kmem_cache *cache, size_t size)
-{
-	dma_unmap_single(NULL, pgtable->handle, size, DMA_TO_DEVICE);
-	kmem_cache_free(cache, pgtable->pgtable);
-}
-
-static uint32_t pgtable_read(struct shmobile_iommu_domain_pgtable *pgtable,
-			     unsigned int index)
-{
-	return pgtable->pgtable[index];
-}
-
-static void pgtable_write(struct shmobile_iommu_domain_pgtable *pgtable,
-			  unsigned int index, unsigned int count, uint32_t val)
-{
-	unsigned int i;
-
-	for (i = 0; i < count; i++)
-		pgtable->pgtable[index + i] = val;
-	dma_sync_single_for_device(NULL, pgtable->handle + index * sizeof(val),
-				   sizeof(val) * count, DMA_TO_DEVICE);
-}
-
-static struct iommu_domain *shmobile_iommu_domain_alloc(unsigned type)
-{
-	struct shmobile_iommu_domain *sh_domain;
-	int i, ret;
-
-	if (type != IOMMU_DOMAIN_UNMANAGED)
-		return NULL;
-
-	sh_domain = kzalloc(sizeof(*sh_domain), GFP_KERNEL);
-	if (!sh_domain)
-		return NULL;
-	ret = pgtable_alloc(&sh_domain->l1, l1cache, L1_SIZE);
-	if (ret < 0) {
-		kfree(sh_domain);
-		return NULL;
-	}
-	for (i = 0; i < L1_LEN; i++)
-		sh_domain->l2[i].pgtable = NULL;
-	spin_lock_init(&sh_domain->map_lock);
-	spin_lock_init(&sh_domain->attached_list_lock);
-	INIT_LIST_HEAD(&sh_domain->attached_list);
-	return &sh_domain->domain;
-}
-
-static void shmobile_iommu_domain_free(struct iommu_domain *domain)
-{
-	struct shmobile_iommu_domain *sh_domain = to_sh_domain(domain);
-	int i;
-
-	for (i = 0; i < L1_LEN; i++) {
-		if (sh_domain->l2[i].pgtable)
-			pgtable_free(&sh_domain->l2[i], l2cache, L2_SIZE);
-	}
-	pgtable_free(&sh_domain->l1, l1cache, L1_SIZE);
-	kfree(sh_domain);
-}
-
-static int shmobile_iommu_attach_device(struct iommu_domain *domain,
-					struct device *dev)
-{
-	struct shmobile_iommu_archdata *archdata = dev->archdata.iommu;
-	struct shmobile_iommu_domain *sh_domain = to_sh_domain(domain);
-	int ret = -EBUSY;
-
-	if (!archdata)
-		return -ENODEV;
-	spin_lock(&sh_domain->attached_list_lock);
-	spin_lock(&archdata->attach_lock);
-	if (archdata->attached != sh_domain) {
-		if (archdata->attached)
-			goto err;
-		ipmmu_tlb_set(archdata->ipmmu, sh_domain->l1.handle, L1_SIZE,
-			      0);
-		ipmmu_tlb_flush(archdata->ipmmu);
-		archdata->attached = sh_domain;
-		archdata->num_attached_devices = 0;
-		list_add(&archdata->attached_list, &sh_domain->attached_list);
-	}
-	archdata->num_attached_devices++;
-	ret = 0;
-err:
-	spin_unlock(&archdata->attach_lock);
-	spin_unlock(&sh_domain->attached_list_lock);
-	return ret;
-}
-
-static void shmobile_iommu_detach_device(struct iommu_domain *domain,
-					 struct device *dev)
-{
-	struct shmobile_iommu_archdata *archdata = dev->archdata.iommu;
-	struct shmobile_iommu_domain *sh_domain = to_sh_domain(domain);
-
-	if (!archdata)
-		return;
-	spin_lock(&sh_domain->attached_list_lock);
-	spin_lock(&archdata->attach_lock);
-	archdata->num_attached_devices--;
-	if (!archdata->num_attached_devices) {
-		ipmmu_tlb_set(archdata->ipmmu, 0, 0, 0);
-		ipmmu_tlb_flush(archdata->ipmmu);
-		archdata->attached = NULL;
-		list_del(&archdata->attached_list);
-	}
-	spin_unlock(&archdata->attach_lock);
-	spin_unlock(&sh_domain->attached_list_lock);
-}
-
-static void domain_tlb_flush(struct shmobile_iommu_domain *sh_domain)
-{
-	struct shmobile_iommu_archdata *archdata;
-
-	spin_lock(&sh_domain->attached_list_lock);
-	list_for_each_entry(archdata, &sh_domain->attached_list, attached_list)
-		ipmmu_tlb_flush(archdata->ipmmu);
-	spin_unlock(&sh_domain->attached_list_lock);
-}
-
-static int l2alloc(struct shmobile_iommu_domain *sh_domain,
-		   unsigned int l1index)
-{
-	int ret;
-
-	if (!sh_domain->l2[l1index].pgtable) {
-		ret = pgtable_alloc(&sh_domain->l2[l1index], l2cache, L2_SIZE);
-		if (ret < 0)
-			return ret;
-	}
-	pgtable_write(&sh_domain->l1, l1index, 1,
-		      sh_domain->l2[l1index].handle | 0x1);
-	return 0;
-}
-
-static void l2realfree(struct shmobile_iommu_domain_pgtable *l2)
-{
-	if (l2->pgtable)
-		pgtable_free(l2, l2cache, L2_SIZE);
-}
-
-static void l2free(struct shmobile_iommu_domain *sh_domain,
-		   unsigned int l1index,
-		   struct shmobile_iommu_domain_pgtable *l2)
-{
-	pgtable_write(&sh_domain->l1, l1index, 1, 0);
-	if (sh_domain->l2[l1index].pgtable) {
-		*l2 = sh_domain->l2[l1index];
-		sh_domain->l2[l1index].pgtable = NULL;
-	}
-}
-
-static int shmobile_iommu_map(struct iommu_domain *domain, unsigned long iova,
-			      phys_addr_t paddr, size_t size, int prot)
-{
-	struct shmobile_iommu_domain_pgtable l2 = { .pgtable = NULL };
-	struct shmobile_iommu_domain *sh_domain = to_sh_domain(domain);
-	unsigned int l1index, l2index;
-	int ret;
-
-	l1index = iova >> 20;
-	switch (size) {
-	case SZ_4K:
-		l2index = (iova >> 12) & 0xff;
-		spin_lock(&sh_domain->map_lock);
-		ret = l2alloc(sh_domain, l1index);
-		if (!ret)
-			pgtable_write(&sh_domain->l2[l1index], l2index, 1,
-				      paddr | 0xff2);
-		spin_unlock(&sh_domain->map_lock);
-		break;
-	case SZ_64K:
-		l2index = (iova >> 12) & 0xf0;
-		spin_lock(&sh_domain->map_lock);
-		ret = l2alloc(sh_domain, l1index);
-		if (!ret)
-			pgtable_write(&sh_domain->l2[l1index], l2index, 0x10,
-				      paddr | 0xff1);
-		spin_unlock(&sh_domain->map_lock);
-		break;
-	case SZ_1M:
-		spin_lock(&sh_domain->map_lock);
-		l2free(sh_domain, l1index, &l2);
-		pgtable_write(&sh_domain->l1, l1index, 1, paddr | 0xc02);
-		spin_unlock(&sh_domain->map_lock);
-		ret = 0;
-		break;
-	default:
-		ret = -EINVAL;
-	}
-	if (!ret)
-		domain_tlb_flush(sh_domain);
-	l2realfree(&l2);
-	return ret;
-}
-
-static size_t shmobile_iommu_unmap(struct iommu_domain *domain,
-				   unsigned long iova, size_t size)
-{
-	struct shmobile_iommu_domain_pgtable l2 = { .pgtable = NULL };
-	struct shmobile_iommu_domain *sh_domain = to_sh_domain(domain);
-	unsigned int l1index, l2index;
-	uint32_t l2entry = 0;
-	size_t ret = 0;
-
-	l1index = iova >> 20;
-	if (!(iova & 0xfffff) && size >= SZ_1M) {
-		spin_lock(&sh_domain->map_lock);
-		l2free(sh_domain, l1index, &l2);
-		spin_unlock(&sh_domain->map_lock);
-		ret = SZ_1M;
-		goto done;
-	}
-	l2index = (iova >> 12) & 0xff;
-	spin_lock(&sh_domain->map_lock);
-	if (sh_domain->l2[l1index].pgtable)
-		l2entry = pgtable_read(&sh_domain->l2[l1index], l2index);
-	switch (l2entry & 3) {
-	case 1:
-		if (l2index & 0xf)
-			break;
-		pgtable_write(&sh_domain->l2[l1index], l2index, 0x10, 0);
-		ret = SZ_64K;
-		break;
-	case 2:
-		pgtable_write(&sh_domain->l2[l1index], l2index, 1, 0);
-		ret = SZ_4K;
-		break;
-	}
-	spin_unlock(&sh_domain->map_lock);
-done:
-	if (ret)
-		domain_tlb_flush(sh_domain);
-	l2realfree(&l2);
-	return ret;
-}
-
-static phys_addr_t shmobile_iommu_iova_to_phys(struct iommu_domain *domain,
-					       dma_addr_t iova)
-{
-	struct shmobile_iommu_domain *sh_domain = to_sh_domain(domain);
-	uint32_t l1entry = 0, l2entry = 0;
-	unsigned int l1index, l2index;
-
-	l1index = iova >> 20;
-	l2index = (iova >> 12) & 0xff;
-	spin_lock(&sh_domain->map_lock);
-	if (sh_domain->l2[l1index].pgtable)
-		l2entry = pgtable_read(&sh_domain->l2[l1index], l2index);
-	else
-		l1entry = pgtable_read(&sh_domain->l1, l1index);
-	spin_unlock(&sh_domain->map_lock);
-	switch (l2entry & 3) {
-	case 1:
-		return (l2entry & ~0xffff) | (iova & 0xffff);
-	case 2:
-		return (l2entry & ~0xfff) | (iova & 0xfff);
-	default:
-		if ((l1entry & 3) == 2)
-			return (l1entry & ~0xfffff) | (iova & 0xfffff);
-		return 0;
-	}
-}
-
-static int find_dev_name(struct shmobile_ipmmu *ipmmu, const char *dev_name)
-{
-	unsigned int i, n = ipmmu->num_dev_names;
-
-	for (i = 0; i < n; i++) {
-		if (strcmp(ipmmu->dev_names[i], dev_name) == 0)
-			return 1;
-	}
-	return 0;
-}
-
-static int shmobile_iommu_add_device(struct device *dev)
-{
-	struct shmobile_iommu_archdata *archdata = ipmmu_archdata;
-	struct dma_iommu_mapping *mapping;
-
-	if (!find_dev_name(archdata->ipmmu, dev_name(dev)))
-		return 0;
-	mapping = archdata->iommu_mapping;
-	if (!mapping) {
-		mapping = arm_iommu_create_mapping(&platform_bus_type, 0,
-						   L1_LEN << 20);
-		if (IS_ERR(mapping))
-			return PTR_ERR(mapping);
-		archdata->iommu_mapping = mapping;
-	}
-	dev->archdata.iommu = archdata;
-	if (arm_iommu_attach_device(dev, mapping))
-		pr_err("arm_iommu_attach_device failed\n");
-	return 0;
-}
-
-static const struct iommu_ops shmobile_iommu_ops = {
-	.domain_alloc = shmobile_iommu_domain_alloc,
-	.domain_free = shmobile_iommu_domain_free,
-	.attach_dev = shmobile_iommu_attach_device,
-	.detach_dev = shmobile_iommu_detach_device,
-	.map = shmobile_iommu_map,
-	.unmap = shmobile_iommu_unmap,
-	.map_sg = default_iommu_map_sg,
-	.iova_to_phys = shmobile_iommu_iova_to_phys,
-	.add_device = shmobile_iommu_add_device,
-	.pgsize_bitmap = SZ_1M | SZ_64K | SZ_4K,
-};
-
-int ipmmu_iommu_init(struct shmobile_ipmmu *ipmmu)
-{
-	static struct shmobile_iommu_archdata *archdata;
-
-	l1cache = kmem_cache_create("shmobile-iommu-pgtable1", L1_SIZE,
-				    L1_ALIGN, SLAB_HWCACHE_ALIGN, NULL);
-	if (!l1cache)
-		return -ENOMEM;
-	l2cache = kmem_cache_create("shmobile-iommu-pgtable2", L2_SIZE,
-				    L2_ALIGN, SLAB_HWCACHE_ALIGN, NULL);
-	if (!l2cache) {
-		kmem_cache_destroy(l1cache);
-		return -ENOMEM;
-	}
-	archdata = kzalloc(sizeof(*archdata), GFP_KERNEL);
-	if (!archdata) {
-		kmem_cache_destroy(l1cache);
-		kmem_cache_destroy(l2cache);
-		return -ENOMEM;
-	}
-	spin_lock_init(&archdata->attach_lock);
-	archdata->ipmmu = ipmmu;
-	ipmmu_archdata = archdata;
-	bus_set_iommu(&platform_bus_type, &shmobile_iommu_ops);
-	return 0;
-}
diff --git a/drivers/iommu/shmobile-ipmmu.c b/drivers/iommu/shmobile-ipmmu.c
deleted file mode 100644
index 951651a..0000000
--- a/drivers/iommu/shmobile-ipmmu.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * IPMMU/IPMMUI
- * Copyright (C) 2012  Hideki EIRAKU
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- */
-
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/platform_data/sh_ipmmu.h>
-#include "shmobile-ipmmu.h"
-
-#define IMCTR1 0x000
-#define IMCTR2 0x004
-#define IMASID 0x010
-#define IMTTBR 0x014
-#define IMTTBCR 0x018
-
-#define IMCTR1_TLBEN (1 << 0)
-#define IMCTR1_FLUSH (1 << 1)
-
-static void ipmmu_reg_write(struct shmobile_ipmmu *ipmmu, unsigned long reg_off,
-			    unsigned long data)
-{
-	iowrite32(data, ipmmu->ipmmu_base + reg_off);
-}
-
-void ipmmu_tlb_flush(struct shmobile_ipmmu *ipmmu)
-{
-	if (!ipmmu)
-		return;
-
-	spin_lock(&ipmmu->flush_lock);
-	if (ipmmu->tlb_enabled)
-		ipmmu_reg_write(ipmmu, IMCTR1, IMCTR1_FLUSH | IMCTR1_TLBEN);
-	else
-		ipmmu_reg_write(ipmmu, IMCTR1, IMCTR1_FLUSH);
-	spin_unlock(&ipmmu->flush_lock);
-}
-
-void ipmmu_tlb_set(struct shmobile_ipmmu *ipmmu, unsigned long phys, int size,
-		   int asid)
-{
-	if (!ipmmu)
-		return;
-
-	spin_lock(&ipmmu->flush_lock);
-	switch (size) {
-	default:
-		ipmmu->tlb_enabled = 0;
-		break;
-	case 0x2000:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 1);
-		ipmmu->tlb_enabled = 1;
-		break;
-	case 0x1000:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 2);
-		ipmmu->tlb_enabled = 1;
-		break;
-	case 0x800:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 3);
-		ipmmu->tlb_enabled = 1;
-		break;
-	case 0x400:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 4);
-		ipmmu->tlb_enabled = 1;
-		break;
-	case 0x200:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 5);
-		ipmmu->tlb_enabled = 1;
-		break;
-	case 0x100:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 6);
-		ipmmu->tlb_enabled = 1;
-		break;
-	case 0x80:
-		ipmmu_reg_write(ipmmu, IMTTBCR, 7);
-		ipmmu->tlb_enabled = 1;
-		break;
-	}
-	ipmmu_reg_write(ipmmu, IMTTBR, phys);
-	ipmmu_reg_write(ipmmu, IMASID, asid);
-	spin_unlock(&ipmmu->flush_lock);
-}
-
-static int ipmmu_probe(struct platform_device *pdev)
-{
-	struct shmobile_ipmmu *ipmmu;
-	struct resource *res;
-	struct shmobile_ipmmu_platform_data *pdata = pdev->dev.platform_data;
-
-	ipmmu = devm_kzalloc(&pdev->dev, sizeof(*ipmmu), GFP_KERNEL);
-	if (!ipmmu) {
-		dev_err(&pdev->dev, "cannot allocate device data\n");
-		return -ENOMEM;
-	}
-	spin_lock_init(&ipmmu->flush_lock);
-	ipmmu->dev = &pdev->dev;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	ipmmu->ipmmu_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(ipmmu->ipmmu_base))
-		return PTR_ERR(ipmmu->ipmmu_base);
-
-	ipmmu->dev_names = pdata->dev_names;
-	ipmmu->num_dev_names = pdata->num_dev_names;
-	platform_set_drvdata(pdev, ipmmu);
-	ipmmu_reg_write(ipmmu, IMCTR1, 0x0); /* disable TLB */
-	ipmmu_reg_write(ipmmu, IMCTR2, 0x0); /* disable PMB */
-	return ipmmu_iommu_init(ipmmu);
-}
-
-static struct platform_driver ipmmu_driver = {
-	.probe = ipmmu_probe,
-	.driver = {
-		.name = "ipmmu",
-	},
-};
-
-static int __init ipmmu_init(void)
-{
-	return platform_driver_register(&ipmmu_driver);
-}
-subsys_initcall(ipmmu_init);
diff --git a/drivers/iommu/shmobile-ipmmu.h b/drivers/iommu/shmobile-ipmmu.h
deleted file mode 100644
index 9524743..0000000
--- a/drivers/iommu/shmobile-ipmmu.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* shmobile-ipmmu.h
- *
- * Copyright (C) 2012  Hideki EIRAKU
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- */
-
-#ifndef __SHMOBILE_IPMMU_H__
-#define __SHMOBILE_IPMMU_H__
-
-struct shmobile_ipmmu {
-	struct device *dev;
-	void __iomem *ipmmu_base;
-	int tlb_enabled;
-	spinlock_t flush_lock;
-	const char * const *dev_names;
-	unsigned int num_dev_names;
-};
-
-#ifdef CONFIG_SHMOBILE_IPMMU_TLB
-void ipmmu_tlb_flush(struct shmobile_ipmmu *ipmmu);
-void ipmmu_tlb_set(struct shmobile_ipmmu *ipmmu, unsigned long phys, int size,
-		   int asid);
-int ipmmu_iommu_init(struct shmobile_ipmmu *ipmmu);
-#else
-static inline int ipmmu_iommu_init(struct shmobile_ipmmu *ipmmu)
-{
-	return -EINVAL;
-}
-#endif
-
-#endif /* __SHMOBILE_IPMMU_H__ */