Update from upstream with manual merge of Yasunori Goto's
changes to swiotlb.c made in commit 281dd25cdc0d6903929b79183816d151ea626341
since this file has been moved from arch/ia64/lib/swiotlb.c to
lib/swiotlb.c

Signed-off-by: Tony Luck <tony.luck@intel.com>
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 945c15a..8f699a2 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -26,6 +26,10 @@
 	bool
 	default y
 
+config SWIOTLB
+       bool
+       default y
+
 config RWSEM_XCHGADD_ALGORITHM
 	bool
 	default y
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index 80f8ef0..317c334 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -17,7 +17,7 @@
 #include <asm/machvec.h>
 
 /* swiotlb declarations & definitions: */
-extern void swiotlb_init_with_default_size (size_t size);
+extern int swiotlb_late_init_with_default_size (size_t size);
 extern ia64_mv_dma_alloc_coherent	swiotlb_alloc_coherent;
 extern ia64_mv_dma_free_coherent	swiotlb_free_coherent;
 extern ia64_mv_dma_map_single		swiotlb_map_single;
@@ -67,7 +67,16 @@
 hwsw_init (void)
 {
 	/* default to a smallish 2MB sw I/O TLB */
-	swiotlb_init_with_default_size (2 * (1<<20));
+	if (swiotlb_late_init_with_default_size (2 * (1<<20)) != 0) {
+#ifdef CONFIG_IA64_GENERIC
+		/* Better to have normal DMA than panic */
+		printk(KERN_WARNING "%s: Failed to initialize software I/O TLB,"
+		       " reverting to hpzx1 platform vector\n", __FUNCTION__);
+		machvec_init("hpzx1");
+#else
+		panic("Unable to initialize software I/O TLB services");
+#endif
+	}
 }
 
 void *
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 1195759..e64ca04 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2028,10 +2028,41 @@
 static int __init
 sba_init(void)
 {
-	acpi_bus_register_driver(&acpi_sba_ioc_driver);
-	if (!ioc_list)
+	if (!ia64_platform_is("hpzx1") && !ia64_platform_is("hpzx1_swiotlb"))
 		return 0;
 
+	acpi_bus_register_driver(&acpi_sba_ioc_driver);
+	if (!ioc_list) {
+#ifdef CONFIG_IA64_GENERIC
+		extern int swiotlb_late_init_with_default_size (size_t size);
+
+		/*
+		 * If we didn't find something sba_iommu can claim, we
+		 * need to setup the swiotlb and switch to the dig machvec.
+		 */
+		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
+			panic("Unable to find SBA IOMMU or initialize "
+			      "software I/O TLB: Try machvec=dig boot option");
+		machvec_init("dig");
+#else
+		panic("Unable to find SBA IOMMU: Try a generic or DIG kernel");
+#endif
+		return 0;
+	}
+
+#if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_HP_ZX1_SWIOTLB)
+	/*
+	 * hpzx1_swiotlb needs to have a fairly small swiotlb bounce
+	 * buffer setup to support devices with smaller DMA masks than
+	 * sba_iommu can handle.
+	 */
+	if (ia64_platform_is("hpzx1_swiotlb")) {
+		extern void hwsw_init(void);
+
+		hwsw_init();
+	}
+#endif
+
 #ifdef CONFIG_PCI
 	{
 		struct pci_bus *b = NULL;
@@ -2048,18 +2079,6 @@
 
 subsys_initcall(sba_init); /* must be initialized after ACPI etc., but before any drivers... */
 
-extern void dig_setup(char**);
-/*
- * MAX_DMA_ADDRESS needs to be setup prior to paging_init to do any good,
- * so we use the platform_setup hook to fix it up.
- */
-void __init
-sba_setup(char **cmdline_p)
-{
-	MAX_DMA_ADDRESS = ~0UL;
-	dig_setup(cmdline_p);
-}
-
 static int __init
 nosbagart(char *str)
 {
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index cb1af59..ac64664 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -9,7 +9,7 @@
 	bitop.o checksum.o clear_page.o csum_partial_copy.o		\
 	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
 	flush.o ip_fast_csum.o do_csum.o				\
-	memset.o strlen.o swiotlb.o
+	memset.o strlen.o
 
 lib-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
 lib-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index bcdd0a8..14328ca 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -27,7 +27,6 @@
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
 obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
-obj-$(CONFIG_SWIOTLB)		+= swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
 
@@ -41,7 +40,6 @@
 bootflag-y			+= ../../i386/kernel/bootflag.o
 cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
 topology-y                     += ../../i386/mach-default/topology.o
-swiotlb-$(CONFIG_SWIOTLB)      += ../../ia64/lib/swiotlb.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
 intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y			+= ../../i386/kernel/quirks.o
diff --git a/include/asm-ia64/machvec_hpzx1.h b/include/asm-ia64/machvec_hpzx1.h
index daafe50..e90daf9 100644
--- a/include/asm-ia64/machvec_hpzx1.h
+++ b/include/asm-ia64/machvec_hpzx1.h
@@ -1,8 +1,7 @@
 #ifndef _ASM_IA64_MACHVEC_HPZX1_h
 #define _ASM_IA64_MACHVEC_HPZX1_h
 
-extern ia64_mv_setup_t dig_setup;
-extern ia64_mv_setup_t			sba_setup;
+extern ia64_mv_setup_t			dig_setup;
 extern ia64_mv_dma_alloc_coherent	sba_alloc_coherent;
 extern ia64_mv_dma_free_coherent	sba_free_coherent;
 extern ia64_mv_dma_map_single		sba_map_single;
@@ -19,15 +18,15 @@
  * platform's machvec structure.  When compiling a non-generic kernel,
  * the macros are used directly.
  */
-#define platform_name			"hpzx1"
-#define platform_setup			sba_setup
-#define platform_dma_init		machvec_noop
-#define platform_dma_alloc_coherent	sba_alloc_coherent
-#define platform_dma_free_coherent	sba_free_coherent
-#define platform_dma_map_single		sba_map_single
-#define platform_dma_unmap_single	sba_unmap_single
-#define platform_dma_map_sg		sba_map_sg
-#define platform_dma_unmap_sg		sba_unmap_sg
+#define platform_name				"hpzx1"
+#define platform_setup				dig_setup
+#define platform_dma_init			machvec_noop
+#define platform_dma_alloc_coherent		sba_alloc_coherent
+#define platform_dma_free_coherent		sba_free_coherent
+#define platform_dma_map_single			sba_map_single
+#define platform_dma_unmap_single		sba_unmap_single
+#define platform_dma_map_sg			sba_map_sg
+#define platform_dma_unmap_sg			sba_unmap_sg
 #define platform_dma_sync_single_for_cpu	machvec_dma_sync_single
 #define platform_dma_sync_sg_for_cpu		machvec_dma_sync_sg
 #define platform_dma_sync_single_for_device	machvec_dma_sync_single
diff --git a/include/asm-ia64/machvec_hpzx1_swiotlb.h b/include/asm-ia64/machvec_hpzx1_swiotlb.h
index 9924b1b..f00a34a 100644
--- a/include/asm-ia64/machvec_hpzx1_swiotlb.h
+++ b/include/asm-ia64/machvec_hpzx1_swiotlb.h
@@ -2,7 +2,6 @@
 #define _ASM_IA64_MACHVEC_HPZX1_SWIOTLB_h
 
 extern ia64_mv_setup_t				dig_setup;
-extern ia64_mv_dma_init				hwsw_init;
 extern ia64_mv_dma_alloc_coherent		hwsw_alloc_coherent;
 extern ia64_mv_dma_free_coherent		hwsw_free_coherent;
 extern ia64_mv_dma_map_single			hwsw_map_single;
@@ -26,7 +25,7 @@
 #define platform_name				"hpzx1_swiotlb"
 
 #define platform_setup				dig_setup
-#define platform_dma_init			hwsw_init
+#define platform_dma_init			machvec_noop
 #define platform_dma_alloc_coherent		hwsw_alloc_coherent
 #define platform_dma_free_coherent		hwsw_free_coherent
 #define platform_dma_map_single			hwsw_map_single
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index e784fdc..45ca88c 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -85,10 +85,33 @@
 	flush_write_buffers();
 }
 
-#define dma_sync_single_range_for_cpu(dev, dma_handle, offset, size, dir)       \
-        dma_sync_single_for_cpu(dev, dma_handle, size, dir)
-#define dma_sync_single_range_for_device(dev, dma_handle, offset, size, dir)    \
-        dma_sync_single_for_device(dev, dma_handle, size, dir)
+static inline void dma_sync_single_range_for_cpu(struct device *hwdev,
+						 dma_addr_t dma_handle,
+						 unsigned long offset,
+						 size_t size, int direction)
+{
+	if (direction == DMA_NONE)
+		out_of_line_bug();
+
+	if (swiotlb)
+		return swiotlb_sync_single_range_for_cpu(hwdev,dma_handle,offset,size,direction);
+
+	flush_write_buffers();
+}
+
+static inline void dma_sync_single_range_for_device(struct device *hwdev,
+						    dma_addr_t dma_handle,
+						    unsigned long offset,
+						    size_t size, int direction)
+{
+        if (direction == DMA_NONE)
+		out_of_line_bug();
+
+	if (swiotlb)
+		return swiotlb_sync_single_range_for_device(hwdev,dma_handle,offset,size,direction);
+
+	flush_write_buffers();
+}
 
 static inline void dma_sync_sg_for_cpu(struct device *hwdev,
 				       struct scatterlist *sg,
diff --git a/include/asm-x86_64/swiotlb.h b/include/asm-x86_64/swiotlb.h
index 3629306..9b011dd 100644
--- a/include/asm-x86_64/swiotlb.h
+++ b/include/asm-x86_64/swiotlb.h
@@ -15,6 +15,14 @@
 extern void swiotlb_sync_single_for_device(struct device *hwdev,
 					    dma_addr_t dev_addr,
 					    size_t size, int dir);
+extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
+					      dma_addr_t dev_addr,
+					      unsigned long offset,
+					      size_t size, int dir);
+extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
+						 dma_addr_t dev_addr,
+						 unsigned long offset,
+						 size_t size, int dir);
 extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
 				     struct scatterlist *sg, int nelems,
 				     int dir);
diff --git a/lib/Makefile b/lib/Makefile
index 44a4675..8535f4d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -44,6 +44,8 @@
 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 
+obj-$(CONFIG_SWIOTLB) += swiotlb.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/arch/ia64/lib/swiotlb.c b/lib/swiotlb.c
similarity index 73%
rename from arch/ia64/lib/swiotlb.c
rename to lib/swiotlb.c
index a604efc..5bdeaae 100644
--- a/arch/ia64/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -1,7 +1,7 @@
 /*
  * Dynamic DMA mapping support.
  *
- * This implementation is for IA-64 platforms that do not support
+ * This implementation is for IA-64 and EM64T platforms that do not support
  * I/O TLBs (aka DMA address translation hardware).
  * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
  * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
@@ -11,21 +11,23 @@
  * 03/05/07 davidm	Switch from PCI-DMA to generic device DMA API.
  * 00/12/13 davidm	Rename to swiotlb.c and add mark_clean() to avoid
  *			unnecessary i-cache flushing.
- * 04/07/.. ak          Better overflow handling. Assorted fixes.
+ * 04/07/.. ak		Better overflow handling. Assorted fixes.
+ * 05/09/10 linville	Add support for syncing ranges, support syncing for
+ *			DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
  */
 
 #include <linux/cache.h>
+#include <linux/dma-mapping.h>
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/pci.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ctype.h>
 
 #include <asm/io.h>
-#include <asm/pci.h>
 #include <asm/dma.h>
+#include <asm/scatterlist.h>
 
 #include <linux/init.h>
 #include <linux/bootmem.h>
@@ -49,6 +51,23 @@
  */
 #define IO_TLB_SHIFT 11
 
+#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+
+/*
+ * Minimum IO TLB size to bother booting with.  Systems with mainly
+ * 64bit capable cards will only lightly use the swiotlb.  If we can't
+ * allocate a contiguous 1MB, we're probably in trouble anyway.
+ */
+#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+
+/*
+ * Enumeration for sync targets
+ */
+enum dma_sync_target {
+	SYNC_FOR_CPU = 0,
+	SYNC_FOR_DEVICE = 1,
+};
+
 int swiotlb_force;
 
 /*
@@ -108,7 +127,7 @@
 
 /*
  * Statically reserve bounce buffer space and initialize bounce buffer data
- * structures for the software IO TLB used to implement the PCI DMA API.
+ * structures for the software IO TLB used to implement the DMA API.
  */
 void
 swiotlb_init_with_default_size (size_t default_size)
@@ -154,6 +173,99 @@
 	swiotlb_init_with_default_size(64 * (1<<20));	/* default to 64MB */
 }
 
+/*
+ * Systems with larger DMA zones (those that don't support ISA) can
+ * initialize the swiotlb later using the slab allocator if needed.
+ * This should be just like above, but with some error catching.
+ */
+int
+swiotlb_late_init_with_default_size (size_t default_size)
+{
+	unsigned long i, req_nslabs = io_tlb_nslabs;
+	unsigned int order;
+
+	if (!io_tlb_nslabs) {
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+	}
+
+	/*
+	 * Get IO TLB memory from the low pages
+	 */
+	order = get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+	io_tlb_nslabs = SLABS_PER_PAGE << order;
+
+	while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+		io_tlb_start = (char *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+		                                        order);
+		if (io_tlb_start)
+			break;
+		order--;
+	}
+
+	if (!io_tlb_start)
+		goto cleanup1;
+
+	if (order != get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT))) {
+		printk(KERN_WARNING "Warning: only able to allocate %ld MB "
+		       "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+		io_tlb_nslabs = SLABS_PER_PAGE << order;
+	}
+	io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
+	memset(io_tlb_start, 0, io_tlb_nslabs * (1 << IO_TLB_SHIFT));
+
+	/*
+	 * Allocate and initialize the free list array.  This array is used
+	 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+	 * between io_tlb_start and io_tlb_end.
+	 */
+	io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+	                              get_order(io_tlb_nslabs * sizeof(int)));
+	if (!io_tlb_list)
+		goto cleanup2;
+
+	for (i = 0; i < io_tlb_nslabs; i++)
+ 		io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+	io_tlb_index = 0;
+
+	io_tlb_orig_addr = (unsigned char **)__get_free_pages(GFP_KERNEL,
+	                           get_order(io_tlb_nslabs * sizeof(char *)));
+	if (!io_tlb_orig_addr)
+		goto cleanup3;
+
+	memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(char *));
+
+	/*
+	 * Get the overflow emergency buffer
+	 */
+	io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+	                                          get_order(io_tlb_overflow));
+	if (!io_tlb_overflow_buffer)
+		goto cleanup4;
+
+	printk(KERN_INFO "Placing %ldMB software IO TLB between 0x%lx - "
+	       "0x%lx\n", (io_tlb_nslabs * (1 << IO_TLB_SHIFT)) >> 20,
+	       virt_to_phys(io_tlb_start), virt_to_phys(io_tlb_end));
+
+	return 0;
+
+cleanup4:
+	free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs *
+	                                                      sizeof(char *)));
+	io_tlb_orig_addr = NULL;
+cleanup3:
+	free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+	                                                 sizeof(int)));
+	io_tlb_list = NULL;
+	io_tlb_end = NULL;
+cleanup2:
+	free_pages((unsigned long)io_tlb_start, order);
+	io_tlb_start = NULL;
+cleanup1:
+	io_tlb_nslabs = req_nslabs;
+	return -ENOMEM;
+}
+
 static inline int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr)
 {
@@ -295,21 +407,28 @@
 }
 
 static void
-sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+sync_single(struct device *hwdev, char *dma_addr, size_t size,
+	    int dir, int target)
 {
 	int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
 	char *buffer = io_tlb_orig_addr[index];
 
-	/*
-	 * bounce... copy the data back into/from the original buffer
-	 * XXX How do you handle DMA_BIDIRECTIONAL here ?
-	 */
-	if (dir == DMA_FROM_DEVICE)
-		memcpy(buffer, dma_addr, size);
-	else if (dir == DMA_TO_DEVICE)
-		memcpy(dma_addr, buffer, size);
-	else
+	switch (target) {
+	case SYNC_FOR_CPU:
+		if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+			memcpy(buffer, dma_addr, size);
+		else if (dir != DMA_TO_DEVICE)
+			BUG();
+		break;
+	case SYNC_FOR_DEVICE:
+		if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+			memcpy(dma_addr, buffer, size);
+		else if (dir != DMA_FROM_DEVICE)
+			BUG();
+		break;
+	default:
 		BUG();
+	}
 }
 
 void *
@@ -383,24 +502,24 @@
 	/*
 	 * Ran out of IOMMU space for this operation. This is very bad.
 	 * Unfortunately the drivers cannot handle this operation properly.
-	 * unless they check for pci_dma_mapping_error (most don't)
+	 * unless they check for dma_mapping_error (most don't)
 	 * When the mapping is small enough return a static buffer to limit
 	 * the damage, or panic when the transfer is too big.
 	 */
-	printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at "
+	printk(KERN_ERR "DMA: Out of SW-IOMMU space for %lu bytes at "
 	       "device %s\n", size, dev ? dev->bus_id : "?");
 
 	if (size > io_tlb_overflow && do_panic) {
-		if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
-			panic("PCI-DMA: Memory would be corrupted\n");
-		if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
-			panic("PCI-DMA: Random memory would be DMAed\n");
+		if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
+			panic("DMA: Memory would be corrupted\n");
+		if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+			panic("DMA: Random memory would be DMAed\n");
 	}
 }
 
 /*
  * Map a single buffer of the indicated size for DMA in streaming mode.  The
- * PCI address to use is returned.
+ * physical address to use is returned.
  *
  * Once the device is given the dma address, the device owns this memory until
  * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
@@ -487,39 +606,73 @@
  * after a transfer.
  *
  * If you perform a swiotlb_map_single() but wish to interrogate the buffer
- * using the cpu, yet do not wish to teardown the PCI dma mapping, you must
- * call this function before doing so.  At the next point you give the PCI dma
+ * using the cpu, yet do not wish to teardown the dma mapping, you must
+ * call this function before doing so.  At the next point you give the dma
  * address back to the card, you must first perform a
  * swiotlb_dma_sync_for_device, and then the device again owns the buffer
  */
-void
-swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
-			    size_t size, int dir)
+static inline void
+swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+		    size_t size, int dir, int target)
 {
 	char *dma_addr = phys_to_virt(dev_addr);
 
 	if (dir == DMA_NONE)
 		BUG();
 	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
-		sync_single(hwdev, dma_addr, size, dir);
+		sync_single(hwdev, dma_addr, size, dir, target);
 	else if (dir == DMA_FROM_DEVICE)
 		mark_clean(dma_addr, size);
 }
 
 void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, int dir)
+{
+	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+
+void
 swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
 			       size_t size, int dir)
 {
-	char *dma_addr = phys_to_virt(dev_addr);
+	swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+
+/*
+ * Same as above, but for a sub-range of the mapping.
+ */
+static inline void
+swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
+			  unsigned long offset, size_t size,
+			  int dir, int target)
+{
+	char *dma_addr = phys_to_virt(dev_addr) + offset;
 
 	if (dir == DMA_NONE)
 		BUG();
 	if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end)
-		sync_single(hwdev, dma_addr, size, dir);
+		sync_single(hwdev, dma_addr, size, dir, target);
 	else if (dir == DMA_FROM_DEVICE)
 		mark_clean(dma_addr, size);
 }
 
+void
+swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				  unsigned long offset, size_t size, int dir)
+{
+	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+				  SYNC_FOR_CPU);
+}
+
+void
+swiotlb_sync_single_range_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				     unsigned long offset, size_t size, int dir)
+{
+	swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
+				  SYNC_FOR_DEVICE);
+}
+
 /*
  * Map a set of buffers described by scatterlist in streaming mode for DMA.
  * This is the scatter-gather version of the above swiotlb_map_single
@@ -594,9 +747,9 @@
  * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
  * and usage.
  */
-void
-swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
-			int nelems, int dir)
+static inline void
+swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sg,
+		int nelems, int dir, int target)
 {
 	int i;
 
@@ -606,22 +759,21 @@
 	for (i = 0; i < nelems; i++, sg++)
 		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
 			sync_single(hwdev, (void *) sg->dma_address,
-				    sg->dma_length, dir);
+				    sg->dma_length, dir, target);
+}
+
+void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+			int nelems, int dir)
+{
+	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
 }
 
 void
 swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
 			   int nelems, int dir)
 {
-	int i;
-
-	if (dir == DMA_NONE)
-		BUG();
-
-	for (i = 0; i < nelems; i++, sg++)
-		if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg))
-			sync_single(hwdev, (void *) sg->dma_address,
-				    sg->dma_length, dir);
+	swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
 }
 
 int
@@ -631,9 +783,9 @@
 }
 
 /*
- * Return whether the given PCI device DMA address mask can be supported
+ * Return whether the given device DMA address mask can be supported
  * properly.  For example, if your device can only drive the low 24-bits
- * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
  * this function.
  */
 int
@@ -649,6 +801,8 @@
 EXPORT_SYMBOL(swiotlb_unmap_sg);
 EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
 EXPORT_SYMBOL(swiotlb_sync_single_for_device);
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_cpu);
+EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
 EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
 EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 EXPORT_SYMBOL(swiotlb_dma_mapping_error);