iommu/dma: Plumb in the per-CPU IOVA caches

With IOVA allocation suitably tidied up, we are finally free to opt in
to the per-CPU caching mechanism. The caching alone can provide a modest
improvement over walking the rbtree for weedier systems (iperf3 shows
~10% more ethernet throughput on an ARM Juno r1 constrained to a single
650MHz Cortex-A53), but the real gain will be in sidestepping the rbtree
lock contention that larger ARM-based systems with lots of parallel I/O
are starting to feel.
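
For reference, the pairing we are moving to boils down to the following
shape (a minimal sketch against the <linux/iova.h> API as used in the
hunk below; helper names are hypothetical, size is assumed to be
iova-aligned, and error handling is elided):

	#include <linux/iova.h>

	/*
	 * alloc_iova_fast() first tries the calling CPU's cache of
	 * recently freed ranges and only falls back to the rbtree on
	 * a miss; free_iova_fast() refills that cache instead of
	 * taking the rbtree lock.
	 */
	static dma_addr_t sketch_alloc(struct iova_domain *iovad,
				       size_t size, u64 dma_limit)
	{
		unsigned long shift = iova_shift(iovad);
		unsigned long pfn;

		pfn = alloc_iova_fast(iovad, size >> shift,
				      dma_limit >> shift);
		return (dma_addr_t)pfn << shift;	/* 0 on failure */
	}

	static void sketch_free(struct iova_domain *iovad,
				dma_addr_t iova, size_t size)
	{
		unsigned long shift = iova_shift(iovad);

		free_iova_fast(iovad, iova >> shift, size >> shift);
	}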

Reviewed-by: Nate Watterson <nwatters@codeaurora.org>
Tested-by: Nate Watterson <nwatters@codeaurora.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Git-commit: bb65a64c7285e7105c1a6c8a33b37770343a4e96
Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
[pdaly@codeaurora.org: Resolve conflicts due to missing MSI changes]

Change-Id: I8b0ef39e577e22b914fc10cfdcf5482b41bd6661
Signed-off-by: Patrick Daly <pdaly@codeaurora.org>
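
To make the size-rounding comment in the hunk below concrete: assuming
a 4K IOVA granule and the upstream IOVA_RANGE_CACHE_MAX_SIZE value of 6,
the caches cover ranges of up to 2^5 = 32 pages, so e.g. (hypothetical
numbers):

	iova_len = 5;			/* a 20KB request	*/
	if (iova_len < (1 << 5))	/* true, so...		*/
		iova_len = roundup_pow_of_two(iova_len);  /* -> 8 */

The allocation and the eventual free then both use the same
power-of-two length and land in the same cache bucket. Without the
rounding, a freed 5-page range could sit in the 8-page bucket and later
be handed out to satisfy an 8-page allocation with only 5 pages
actually reserved beneath it. A 33-page request, by contrast, is
neither rounded nor cached, and falls straight through to the rbtree.
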
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index eb920a4..3aa3546 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -208,32 +208,32 @@
 	struct iova_domain *iovad = cookie_iovad(domain);
 	unsigned long shift = iova_shift(iovad);
 	unsigned long iova_len = size >> shift;
-	struct iova *iova = NULL;
+	unsigned long iova = 0;
+
+	/*
+	 * Freeing non-power-of-two-sized allocations back into the IOVA caches
+	 * will come back to bite us badly, so we have to waste a bit of space
+	 * rounding up anything cacheable to make sure that can't happen. The
+	 * order of the unadjusted size will still match upon freeing.
+	 */
+	if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1)))
+		iova_len = roundup_pow_of_two(iova_len);
 
 	if (domain->geometry.force_aperture)
 		dma_limit = min(dma_limit, domain->geometry.aperture_end);
-	/*
-	 * Enforce size-alignment to be safe - there could perhaps be an
-	 * attribute to control this per-device, or at least per-domain...
-	 */
-	iova = alloc_iova(iovad, iova_len, dma_limit >> shift, true);
-	if (!iova)
-		return 0;
 
-	return (dma_addr_t)iova->pfn_lo << shift;
+	iova = alloc_iova_fast(iovad, iova_len, dma_limit >> shift);
+
+	return (dma_addr_t)iova << shift;
 }
 
 static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie,
 		dma_addr_t iova, size_t size)
 {
 	struct iova_domain *iovad = &cookie->iovad;
-	struct iova *iova_rbnode;
+	unsigned long shift = iova_shift(iovad);
 
-	iova_rbnode = find_iova(iovad, iova_pfn(iovad, iova));
-	if (WARN_ON(!iova_rbnode))
-		return;
-
-	__free_iova(iovad, iova_rbnode);
+	free_iova_fast(iovad, iova >> shift, size >> shift);
 }
 
 static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr,