firewire: ohci: avoid separate DMA mapping for small AT payloads

For AT packet payloads of up to eight bytes, we have enough unused space
in the DMA descriptors list so that we can put a copy of the payload
there and thus avoid having to create a separate streaming DMA mapping
for the payload buffer.

In a CPU-bound microbenchmark that just sends 8-byte packets, bandwidth
was measured to increase by 5.7 %, from 1009 KB/s to 1067 KB/s.  In
practice, the only performance-sensitive usage of small asynchronous
packets is the SBP-2 driver's write to the ORB_POINTER register during
SCSI command submission.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index dfb3cb7..435ed80 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -1218,6 +1218,7 @@
 }
 
 struct driver_data {
+	u8 inline_data[8];
 	struct fw_packet *packet;
 };
 
@@ -1301,20 +1302,28 @@
 		return -1;
 	}
 
+	BUILD_BUG_ON(sizeof(struct driver_data) > sizeof(struct descriptor));
 	driver_data = (struct driver_data *) &d[3];
 	driver_data->packet = packet;
 	packet->driver_data = driver_data;
 
 	if (packet->payload_length > 0) {
-		payload_bus =
-			dma_map_single(ohci->card.device, packet->payload,
-				       packet->payload_length, DMA_TO_DEVICE);
-		if (dma_mapping_error(ohci->card.device, payload_bus)) {
-			packet->ack = RCODE_SEND_ERROR;
-			return -1;
+		if (packet->payload_length > sizeof(driver_data->inline_data)) {
+			payload_bus = dma_map_single(ohci->card.device,
+						     packet->payload,
+						     packet->payload_length,
+						     DMA_TO_DEVICE);
+			if (dma_mapping_error(ohci->card.device, payload_bus)) {
+				packet->ack = RCODE_SEND_ERROR;
+				return -1;
+			}
+			packet->payload_bus	= payload_bus;
+			packet->payload_mapped	= true;
+		} else {
+			memcpy(driver_data->inline_data, packet->payload,
+			       packet->payload_length);
+			payload_bus = d_bus + 3 * sizeof(*d);
 		}
-		packet->payload_bus	= payload_bus;
-		packet->payload_mapped	= true;
 
 		d[2].req_count    = cpu_to_le16(packet->payload_length);
 		d[2].data_address = cpu_to_le32(payload_bus);