ath10k: Improve performance by reducing tx_lock contention

During tx completion, tx_lock is held for longer than required, preventing
efficient refill of htt->pending_tx. Refactor the code so that only MSDU
related operations are protected by the lock.

Improves downstream performance on a dual-core ARM Freescale LS1024A
(f.k.a. Mindspeed Comcerto 2000) AP with a 3x3 client from 495 to 580 Mbps.
Other CPU bound multicore systems may also benefit.

Signed-off-by: Denton Gentry <dgentry@google.com>
Signed-off-by: Avery Pennarun <apenwarr@google.com>
[mfaltesek@google.com: removed conflicting code for tracking msdu_ids.]
Signed-off-by: Marty Faltesek <mfaltesek@google.com>
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
diff --git a/drivers/net/wireless/ath/ath10k/htt_tx.c b/drivers/net/wireless/ath/ath10k/htt_tx.c
index a97dd9d..2c0627b 100644
--- a/drivers/net/wireless/ath/ath10k/htt_tx.c
+++ b/drivers/net/wireless/ath/ath10k/htt_tx.c
@@ -134,9 +134,7 @@
 	tx_done.discard = 1;
 	tx_done.msdu_id = msdu_id;
 
-	spin_lock_bh(&htt->tx_lock);
 	ath10k_txrx_tx_unref(htt, &tx_done);
-	spin_unlock_bh(&htt->tx_lock);
 
 	return 0;
 }
@@ -429,12 +427,11 @@
 
 	spin_lock_bh(&htt->tx_lock);
 	res = ath10k_htt_tx_alloc_msdu_id(htt, msdu);
+	spin_unlock_bh(&htt->tx_lock);
 	if (res < 0) {
-		spin_unlock_bh(&htt->tx_lock);
 		goto err_tx_dec;
 	}
 	msdu_id = res;
-	spin_unlock_bh(&htt->tx_lock);
 
 	txdesc = ath10k_htc_alloc_skb(ar, len);
 	if (!txdesc) {
@@ -506,12 +503,11 @@
 
 	spin_lock_bh(&htt->tx_lock);
 	res = ath10k_htt_tx_alloc_msdu_id(htt, msdu);
+	spin_unlock_bh(&htt->tx_lock);
 	if (res < 0) {
-		spin_unlock_bh(&htt->tx_lock);
 		goto err_tx_dec;
 	}
 	msdu_id = res;
-	spin_unlock_bh(&htt->tx_lock);
 
 	prefetch_len = min(htt->prefetch_len, msdu->len);
 	prefetch_len = roundup(prefetch_len, 4);