ioat2,3: cacheline align software descriptor allocations

All the necessary fields for handling an ioat2,3 ring entry can fit into
one cacheline.  Move ->len prior to ->txd in struct ioat_ring_ent, and
move allocation of these entries to a hw-cache-aligned kmem cache to
reduce the number of cachelines dirtied for descriptor management.
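
For reference, a minimal sketch of the two pieces this describes.  The
struct is reduced to just the fields named above, and ioat2_cache_init()
is a hypothetical helper standing in for wherever the cache is actually
created at module load; SLAB_HWCACHE_ALIGN is what provides the
hw-cache alignment:

struct ioat_ring_ent {
	struct ioat_dma_descriptor *hw;
	size_t len;	/* moved ahead of ->txd; hot fields share one line */
	struct dma_async_tx_descriptor txd;
};

static struct kmem_cache *ioat2_cache;

static int __init ioat2_cache_init(void)
{
	/* SLAB_HWCACHE_ALIGN starts every entry on a cacheline boundary */
	ioat2_cache = kmem_cache_create("ioat2",
					sizeof(struct ioat_ring_ent),
					0, SLAB_HWCACHE_ALIGN, NULL);
	return ioat2_cache ? 0 : -ENOMEM;
}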

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c
index 460b773..fa3d6db 100644
--- a/drivers/dma/ioat/dma_v2.c
+++ b/drivers/dma/ioat/dma_v2.c
@@ -399,11 +399,12 @@
 		return NULL;
 	memset(hw, 0, sizeof(*hw));
 
-	desc = kzalloc(sizeof(*desc), flags);
+	desc = kmem_cache_alloc(ioat2_cache, flags);
 	if (!desc) {
 		pci_pool_free(dma->dma_pool, hw, phys);
 		return NULL;
 	}
+	memset(desc, 0, sizeof(*desc));
 
 	dma_async_tx_descriptor_init(&desc->txd, chan);
 	desc->txd.tx_submit = ioat2_tx_submit_unlock;
@@ -418,7 +419,7 @@
 
 	dma = to_ioatdma_device(chan->device);
 	pci_pool_free(dma->dma_pool, desc->hw, desc->txd.phys);
-	kfree(desc);
+	kmem_cache_free(ioat2_cache, desc);
 }
 
 static struct ioat_ring_ent **ioat2_alloc_ring(struct dma_chan *c, int order, gfp_t flags)
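
Note that kmem_cache_alloc(), unlike the kzalloc() it replaces, returns
uninitialized memory, hence the added memset().  Assuming full zeroing
of the entry is still wanted, the alloc-plus-memset pair could
equivalently be written with kmem_cache_zalloc(); a sketch:

	desc = kmem_cache_zalloc(ioat2_cache, flags);
	if (!desc) {
		pci_pool_free(dma->dma_pool, hw, phys);
		return NULL;
	}

Either way, entries must be released back to the same cache with
kmem_cache_free(), as the second hunk does.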