libata: fix ATAPI DMA alignment issues

ATAPI needs to be padded to next 4 byte boundary, if misaligned.

Original work by me, many fixes from Tejun Heo.
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 540a851..79fdbba 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -449,14 +449,14 @@
 
 static void pdc20621_dma_prep(struct ata_queued_cmd *qc)
 {
-	struct scatterlist *sg = qc->sg;
+	struct scatterlist *sg;
 	struct ata_port *ap = qc->ap;
 	struct pdc_port_priv *pp = ap->private_data;
 	void __iomem *mmio = ap->host_set->mmio_base;
 	struct pdc_host_priv *hpriv = ap->host_set->private_data;
 	void __iomem *dimm_mmio = hpriv->dimm_mmio;
 	unsigned int portno = ap->port_no;
-	unsigned int i, last, idx, total_len = 0, sgt_len;
+	unsigned int i, idx, total_len = 0, sgt_len;
 	u32 *buf = (u32 *) &pp->dimm_buf[PDC_DIMM_HEADER_SZ];
 
 	assert(qc->flags & ATA_QCFLAG_DMAMAP);
@@ -469,12 +469,11 @@
 	/*
 	 * Build S/G table
 	 */
-	last = qc->n_elem;
 	idx = 0;
-	for (i = 0; i < last; i++) {
-		buf[idx++] = cpu_to_le32(sg_dma_address(&sg[i]));
-		buf[idx++] = cpu_to_le32(sg_dma_len(&sg[i]));
-		total_len += sg_dma_len(&sg[i]);
+	ata_for_each_sg(sg, qc) {
+		buf[idx++] = cpu_to_le32(sg_dma_address(sg));
+		buf[idx++] = cpu_to_le32(sg_dma_len(sg));
+		total_len += sg_dma_len(sg);
 	}
 	buf[idx - 1] |= cpu_to_le32(ATA_PRD_EOT);
 	sgt_len = idx * 4;