arm64: Fix DMA range invalidation for cache line unaligned buffers

If the buffer needing cache invalidation for inbound DMA does not start or
end on a cache line aligned address, we need to use the non-destructive
clean&invalidate operation. This issue was introduced by commit
7363590d2c46 (arm64: Implement coherent DMA API based on swiotlb).

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Jon Medhurst (Tixy) <tixy@linaro.org>
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index e803a62..fda7568 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -183,12 +183,19 @@
 __dma_inv_range:
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
-	bic	x0, x0, x3
+	tst	x1, x3				// end cache line aligned?
 	bic	x1, x1, x3
-1:	dc	ivac, x0			// invalidate D / U line
-	add	x0, x0, x2
+	b.eq	1f
+	dc	civac, x1			// clean & invalidate D / U line
+1:	tst	x0, x3				// start cache line aligned?
+	bic	x0, x0, x3
+	b.eq	2f
+	dc	civac, x0			// clean & invalidate D / U line
+	b	3f
+2:	dc	ivac, x0			// invalidate D / U line
+3:	add	x0, x0, x2
 	cmp	x0, x1
-	b.lo	1b
+	b.lo	2b
 	dsb	sy
 	ret
 ENDPROC(__inval_cache_range)