drm/i915: Introduce & use new lightweight SGL iterators

The existing for_each_sg_page() iterator is somewhat heavyweight, and is
limiting i915 driver performance in a few benchmarks. So here we
introduce somewhat lighter-weight iterators, primarily for use with GEM
objects or other cases where we need only deal with whole aligned pages.

Unlike the old iterator, the new iterators use an internal state
structure which is not intended to be accessed by the caller; instead
each takes as a parameter an output variable which is set before each
iteration. This makes them particularly simple to use :)

One of the new iterators provides the caller with the DMA address of
each page in turn; the other provides the 'struct page' pointer required
by many memory management operations.
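As a rough usage sketch (not part of the patch itself): the DMA flavour
and its argument order follow the conversions below; the name of the
struct-page flavour and the per-page work are assumptions here, shown
only to illustrate the calling convention.

	struct sgt_iter sgt_iter;
	struct page *page;
	dma_addr_t addr;

	/* DMA flavour, as in the gen6/gen8 insert_entries conversions below */
	for_each_sgt_dma(addr, sgt_iter, obj->pages)
		do_something_with(addr);	/* hypothetical per-page work */

	/* struct-page flavour; name assumed analogous to the DMA variant */
	for_each_sgt_page(page, sgt_iter, obj->pages)
		set_page_dirty(page);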

Various uses of for_each_sg_page() are then converted to the new macros.

v2: Force inlining of the sg_iter constructor and make the union
anonymous.

Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1463741647-15666-4-git-send-email-chris@chris-wilson.co.uk
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 7eab619..4668477 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1839,20 +1839,19 @@
 				      enum i915_cache_level cache_level, u32 flags)
 {
 	struct i915_hw_ppgtt *ppgtt = i915_vm_to_ppgtt(vm);
-	gen6_pte_t *pt_vaddr;
 	unsigned first_entry = start >> PAGE_SHIFT;
 	unsigned act_pt = first_entry / GEN6_PTES;
 	unsigned act_pte = first_entry % GEN6_PTES;
-	struct sg_page_iter sg_iter;
+	gen6_pte_t *pt_vaddr = NULL;
+	struct sgt_iter sgt_iter;
+	dma_addr_t addr;
 
-	pt_vaddr = NULL;
-	for_each_sg_page(pages->sgl, &sg_iter, pages->nents, 0) {
+	for_each_sgt_dma(addr, sgt_iter, pages) {
 		if (pt_vaddr == NULL)
 			pt_vaddr = kmap_px(ppgtt->pd.page_table[act_pt]);
 
 		pt_vaddr[act_pte] =
-			vm->pte_encode(sg_page_iter_dma_address(&sg_iter),
-				       cache_level, true, flags);
+			vm->pte_encode(addr, cache_level, true, flags);
 
 		if (++act_pte == GEN6_PTES) {
 			kunmap_px(ppgtt, pt_vaddr);
@@ -1861,6 +1860,7 @@
 			act_pte = 0;
 		}
 	}
+
 	if (pt_vaddr)
 		kunmap_px(ppgtt, pt_vaddr);
 }
@@ -2362,22 +2362,20 @@
 {
 	struct drm_i915_private *dev_priv = to_i915(vm->dev);
 	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
-	unsigned first_entry = start >> PAGE_SHIFT;
-	gen8_pte_t __iomem *gtt_entries =
-		(gen8_pte_t __iomem *)ggtt->gsm + first_entry;
-	int i = 0;
-	struct sg_page_iter sg_iter;
-	dma_addr_t addr = 0; /* shut up gcc */
+	struct sgt_iter sgt_iter;
+	gen8_pte_t __iomem *gtt_entries;
+	gen8_pte_t gtt_entry;
+	dma_addr_t addr;
 	int rpm_atomic_seq;
+	int i = 0;
 
 	rpm_atomic_seq = assert_rpm_atomic_begin(dev_priv);
 
-	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) {
-		addr = sg_dma_address(sg_iter.sg) +
-			(sg_iter.sg_pgoffset << PAGE_SHIFT);
-		gen8_set_pte(&gtt_entries[i],
-			     gen8_pte_encode(addr, level, true));
-		i++;
+	gtt_entries = (gen8_pte_t __iomem *)ggtt->gsm + (start >> PAGE_SHIFT);
+
+	for_each_sgt_dma(addr, sgt_iter, st) {
+		gtt_entry = gen8_pte_encode(addr, level, true);
+		gen8_set_pte(&gtt_entries[i++], gtt_entry);
 	}
 
 	/*
@@ -2388,8 +2386,7 @@
 	 * hardware should work, we must keep this posting read for paranoia.
 	 */
 	if (i != 0)
-		WARN_ON(readq(&gtt_entries[i-1])
-			!= gen8_pte_encode(addr, level, true));
+		WARN_ON(readq(&gtt_entries[i-1]) != gtt_entry);
 
 	/* This next bit makes the above posting read even more important. We
 	 * want to flush the TLBs only after we're certain all the PTE updates
@@ -2440,20 +2437,20 @@
 {
 	struct drm_i915_private *dev_priv = to_i915(vm->dev);
 	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
-	unsigned first_entry = start >> PAGE_SHIFT;
-	gen6_pte_t __iomem *gtt_entries =
-		(gen6_pte_t __iomem *)ggtt->gsm + first_entry;
-	int i = 0;
-	struct sg_page_iter sg_iter;
-	dma_addr_t addr = 0;
+	struct sgt_iter sgt_iter;
+	gen6_pte_t __iomem *gtt_entries;
+	gen6_pte_t gtt_entry;
+	dma_addr_t addr;
 	int rpm_atomic_seq;
+	int i = 0;
 
 	rpm_atomic_seq = assert_rpm_atomic_begin(dev_priv);
 
-	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) {
-		addr = sg_page_iter_dma_address(&sg_iter);
-		iowrite32(vm->pte_encode(addr, level, true, flags), &gtt_entries[i]);
-		i++;
+	gtt_entries = (gen6_pte_t __iomem *)ggtt->gsm + (start >> PAGE_SHIFT);
+
+	for_each_sgt_dma(addr, sgt_iter, st) {
+		gtt_entry = vm->pte_encode(addr, level, true, flags);
+		iowrite32(gtt_entry, &gtt_entries[i++]);
 	}
 
 	/* XXX: This serves as a posting read to make sure that the PTE has
@@ -2462,10 +2459,8 @@
 	 * of NUMA access patterns. Therefore, even with the way we assume
 	 * hardware should work, we must keep this posting read for paranoia.
 	 */
-	if (i != 0) {
-		unsigned long gtt = readl(&gtt_entries[i-1]);
-		WARN_ON(gtt != vm->pte_encode(addr, level, true, flags));
-	}
+	if (i != 0)
+		WARN_ON(readl(&gtt_entries[i-1]) != gtt_entry);
 
 	/* This next bit makes the above posting read even more important. We
 	 * want to flush the TLBs only after we're certain all the PTE updates
@@ -3399,9 +3394,11 @@
 intel_rotate_fb_obj_pages(struct intel_rotation_info *rot_info,
 			  struct drm_i915_gem_object *obj)
 {
+	const size_t n_pages = obj->base.size / PAGE_SIZE;
 	unsigned int size_pages = rot_info->plane[0].width * rot_info->plane[0].height;
 	unsigned int size_pages_uv;
-	struct sg_page_iter sg_iter;
+	struct sgt_iter sgt_iter;
+	dma_addr_t dma_addr;
 	unsigned long i;
 	dma_addr_t *page_addr_list;
 	struct sg_table *st;
@@ -3410,7 +3407,7 @@
 	int ret = -ENOMEM;
 
 	/* Allocate a temporary list of source pages for random access. */
-	page_addr_list = drm_malloc_gfp(obj->base.size / PAGE_SIZE,
+	page_addr_list = drm_malloc_gfp(n_pages,
 					sizeof(dma_addr_t),
 					GFP_TEMPORARY);
 	if (!page_addr_list)
@@ -3433,11 +3430,10 @@
 
 	/* Populate source page list from the object. */
 	i = 0;
-	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) {
-		page_addr_list[i] = sg_page_iter_dma_address(&sg_iter);
-		i++;
-	}
+	for_each_sgt_dma(dma_addr, sgt_iter, obj->pages)
+		page_addr_list[i++] = dma_addr;
 
+	GEM_BUG_ON(i != n_pages);
 	st->nents = 0;
 	sg = st->sgl;