iwlwifi: use coherent DMA memory for command header

Recently in commit 8a964f44e01ad3bbc208c3e80d931ba91b9ea786
("iwlwifi: always copy first 16 bytes of commands") we fixed
the problem that the hardware writes back to the command and
that could overwrite parts of the data that was still needed
and would thus be corrupted.

Investigating this problem more closely we found that this
write-back isn't really ordered very well with respect to
other DMA traffic. Therefore, it sometimes happened that the
write-back occurred after unmapping the command again which
is clearly an issue and could corrupt the next allocation
that goes to that spot, or (better) cause IOMMU faults.

To fix this, allocate coherent memory for the first 16 bytes
of each command, containing the write-back part, and use it
for all queues. All the dynamic DMA mappings only need to be
TO_DEVICE then. This ensures that even when the write-back
happens "too late" it can't hit memory that has been freed
or a mapping that doesn't exist any more.

Since now the actual command is no longer modified, we can
also remove CMD_WANT_HCMD and get rid of the DMA sync that
was necessary to update the scratch pointer.

Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
diff --git a/drivers/net/wireless/iwlwifi/pcie/internal.h b/drivers/net/wireless/iwlwifi/pcie/internal.h
index 3d62e80..148843e7 100644
--- a/drivers/net/wireless/iwlwifi/pcie/internal.h
+++ b/drivers/net/wireless/iwlwifi/pcie/internal.h
@@ -137,10 +137,6 @@
 struct iwl_cmd_meta {
 	/* only for SYNC commands, iff the reply skb is wanted */
 	struct iwl_host_cmd *source;
-
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-	DEFINE_DMA_UNMAP_LEN(len);
-
 	u32 flags;
 };
 
@@ -185,25 +181,36 @@
 /*
  * The FH will write back to the first TB only, so we need
  * to copy some data into the buffer regardless of whether
- * it should be mapped or not. This indicates how much to
- * copy, even for HCMDs it must be big enough to fit the
- * DRAM scratch from the TX cmd, at least 16 bytes.
+ * it should be mapped or not. This indicates how big the
+ * first TB must be to include the scratch buffer. Since
+ * the scratch is 4 bytes at offset 12, it's 16 now. If we
+ * make it bigger then allocations will be bigger and copy
+ * slower, so that's probably not useful.
  */
-#define IWL_HCMD_MIN_COPY_SIZE	16
+#define IWL_HCMD_SCRATCHBUF_SIZE	16
 
 struct iwl_pcie_txq_entry {
 	struct iwl_device_cmd *cmd;
-	struct iwl_device_cmd *copy_cmd;
 	struct sk_buff *skb;
 	/* buffer to free after command completes */
 	const void *free_buf;
 	struct iwl_cmd_meta meta;
 };
 
+struct iwl_pcie_txq_scratch_buf {
+	struct iwl_cmd_header hdr;
+	u8 buf[8];
+	__le32 scratch;
+};
+
 /**
  * struct iwl_txq - Tx Queue for DMA
  * @q: generic Rx/Tx queue descriptor
  * @tfds: transmit frame descriptors (DMA memory)
+ * @scratchbufs: start of command headers, including scratch buffers, for
+ *	the writeback -- this is DMA memory and an array holding one buffer
+ *	for each command on the queue
+ * @scratchbufs_dma: DMA address for the scratchbufs start
  * @entries: transmit entries (driver state)
  * @lock: queue lock
  * @stuck_timer: timer that fires if queue gets stuck
@@ -217,6 +224,8 @@
 struct iwl_txq {
 	struct iwl_queue q;
 	struct iwl_tfd *tfds;
+	struct iwl_pcie_txq_scratch_buf *scratchbufs;
+	dma_addr_t scratchbufs_dma;
 	struct iwl_pcie_txq_entry *entries;
 	spinlock_t lock;
 	struct timer_list stuck_timer;
@@ -225,6 +234,13 @@
 	u8 active;
 };
 
+static inline dma_addr_t
+iwl_pcie_get_scratchbuf_dma(struct iwl_txq *txq, int idx)
+{
+	return txq->scratchbufs_dma +
+	       sizeof(struct iwl_pcie_txq_scratch_buf) * idx;
+}
+
 /**
  * struct iwl_trans_pcie - PCIe transport specific data
  * @rxq: all the RX queue data