ath9k: use ieee80211_tx_status_noskb where possible

It removes the need for undoing the padding changes to skb->data and it
improves performance by eliminating one tx status lookup per MPDU in the
status path. It is also useful for preparing a follow-up fix to better
handle powersave filtering.

A side effect is that these counters, available via debugfs, become now invalid:

* dot11TransmittedFragmentCount
* dot11FrameDuplicateCount,
* dot11ReceivedFragmentCount
* dot11MulticastReceivedFrameCount

Signed-off-by: Felix Fietkau <nbd@nbd.name>
[kvalo@qca.qualcomm.com: add a note about counters, thanks to Zefir Kurtisi]
Signed-off-by: Kalle Valo <kvalo@qca.qualcomm.com>
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index 8ddd604..b99ffa0 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -50,9 +50,11 @@
 static void ath_tx_send_normal(struct ath_softc *sc, struct ath_txq *txq,
 			       struct ath_atx_tid *tid, struct sk_buff *skb);
 static void ath_tx_complete(struct ath_softc *sc, struct sk_buff *skb,
-			    int tx_flags, struct ath_txq *txq);
+			    int tx_flags, struct ath_txq *txq,
+			    struct ieee80211_sta *sta);
 static void ath_tx_complete_buf(struct ath_softc *sc, struct ath_buf *bf,
 				struct ath_txq *txq, struct list_head *bf_q,
+				struct ieee80211_sta *sta,
 				struct ath_tx_status *ts, int txok);
 static void ath_tx_txqaddbuf(struct ath_softc *sc, struct ath_txq *txq,
 			     struct list_head *head, bool internal);
@@ -77,6 +79,22 @@
 /* Aggregation logic */
 /*********************/
 
+static void ath_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ieee80211_sta *sta = info->status.status_driver_data[0];
+
+	if (info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS) {
+		ieee80211_tx_status(hw, skb);
+		return;
+	}
+
+	if (sta)
+		ieee80211_tx_status_noskb(hw, sta, info);
+
+	dev_kfree_skb(skb);
+}
+
 void ath_txq_lock(struct ath_softc *sc, struct ath_txq *txq)
 	__acquires(&txq->axq_lock)
 {
@@ -92,6 +110,7 @@
 void ath_txq_unlock_complete(struct ath_softc *sc, struct ath_txq *txq)
 	__releases(&txq->axq_lock)
 {
+	struct ieee80211_hw *hw = sc->hw;
 	struct sk_buff_head q;
 	struct sk_buff *skb;
 
@@ -100,7 +119,7 @@
 	spin_unlock_bh(&txq->axq_lock);
 
 	while ((skb = __skb_dequeue(&q)))
-		ieee80211_tx_status(sc->hw, skb);
+		ath_tx_status(hw, skb);
 }
 
 static void ath_tx_queue_tid(struct ath_softc *sc, struct ath_txq *txq,
@@ -253,7 +272,7 @@
 		}
 
 		list_add_tail(&bf->list, &bf_head);
-		ath_tx_complete_buf(sc, bf, txq, &bf_head, &ts, 0);
+		ath_tx_complete_buf(sc, bf, txq, &bf_head, NULL, &ts, 0);
 	}
 
 	if (sendbar) {
@@ -318,12 +337,12 @@
 		bf = fi->bf;
 
 		if (!bf) {
-			ath_tx_complete(sc, skb, ATH_TX_ERROR, txq);
+			ath_tx_complete(sc, skb, ATH_TX_ERROR, txq, NULL);
 			continue;
 		}
 
 		list_add_tail(&bf->list, &bf_head);
-		ath_tx_complete_buf(sc, bf, txq, &bf_head, &ts, 0);
+		ath_tx_complete_buf(sc, bf, txq, &bf_head, NULL, &ts, 0);
 	}
 }
 
@@ -426,12 +445,11 @@
 
 static void ath_tx_complete_aggr(struct ath_softc *sc, struct ath_txq *txq,
 				 struct ath_buf *bf, struct list_head *bf_q,
+				 struct ieee80211_sta *sta,
 				 struct ath_tx_status *ts, int txok)
 {
 	struct ath_node *an = NULL;
 	struct sk_buff *skb;
-	struct ieee80211_sta *sta;
-	struct ieee80211_hw *hw = sc->hw;
 	struct ieee80211_hdr *hdr;
 	struct ieee80211_tx_info *tx_info;
 	struct ath_atx_tid *tid = NULL;
@@ -460,12 +478,7 @@
 	for (i = 0; i < ts->ts_rateindex; i++)
 		retries += rates[i].count;
 
-	rcu_read_lock();
-
-	sta = ieee80211_find_sta_by_ifaddr(hw, hdr->addr1, hdr->addr2);
 	if (!sta) {
-		rcu_read_unlock();
-
 		INIT_LIST_HEAD(&bf_head);
 		while (bf) {
 			bf_next = bf->bf_next;
@@ -473,7 +486,7 @@
 			if (!bf->bf_state.stale || bf_next != NULL)
 				list_move_tail(&bf->list, &bf_head);
 
-			ath_tx_complete_buf(sc, bf, txq, &bf_head, ts, 0);
+			ath_tx_complete_buf(sc, bf, txq, &bf_head, NULL, ts, 0);
 
 			bf = bf_next;
 		}
@@ -583,7 +596,7 @@
 								ts);
 			}
 
-			ath_tx_complete_buf(sc, bf, txq, &bf_head, ts,
+			ath_tx_complete_buf(sc, bf, txq, &bf_head, sta, ts,
 				!txfail);
 		} else {
 			if (tx_info->flags & IEEE80211_TX_STATUS_EOSP) {
@@ -604,7 +617,8 @@
 					ath_tx_update_baw(sc, tid, seqno);
 
 					ath_tx_complete_buf(sc, bf, txq,
-							    &bf_head, ts, 0);
+							    &bf_head, NULL, ts,
+							    0);
 					bar_index = max_t(int, bar_index,
 						ATH_BA_INDEX(seq_first, seqno));
 					break;
@@ -648,8 +662,6 @@
 		ath_txq_lock(sc, txq);
 	}
 
-	rcu_read_unlock();
-
 	if (needreset)
 		ath9k_queue_reset(sc, RESET_TYPE_TX_ERROR);
 }
@@ -664,7 +676,10 @@
 				  struct ath_tx_status *ts, struct ath_buf *bf,
 				  struct list_head *bf_head)
 {
+	struct ieee80211_hw *hw = sc->hw;
 	struct ieee80211_tx_info *info;
+	struct ieee80211_sta *sta;
+	struct ieee80211_hdr *hdr;
 	bool txok, flush;
 
 	txok = !(ts->ts_status & ATH9K_TXERR_MASK);
@@ -677,6 +692,10 @@
 
 	ts->duration = ath9k_hw_get_duration(sc->sc_ah, bf->bf_desc,
 					     ts->ts_rateindex);
+
+	hdr = (struct ieee80211_hdr *) bf->bf_mpdu->data;
+	sta = ieee80211_find_sta_by_ifaddr(hw, hdr->addr1, hdr->addr2);
+
 	if (!bf_isampdu(bf)) {
 		if (!flush) {
 			info = IEEE80211_SKB_CB(bf->bf_mpdu);
@@ -685,9 +704,9 @@
 			ath_tx_rc_status(sc, bf, ts, 1, txok ? 0 : 1, txok);
 			ath_dynack_sample_tx_ts(sc->sc_ah, bf->bf_mpdu, ts);
 		}
-		ath_tx_complete_buf(sc, bf, txq, bf_head, ts, txok);
+		ath_tx_complete_buf(sc, bf, txq, bf_head, sta, ts, txok);
 	} else
-		ath_tx_complete_aggr(sc, txq, bf, bf_head, ts, txok);
+		ath_tx_complete_aggr(sc, txq, bf, bf_head, sta, ts, txok);
 
 	if (!flush)
 		ath_txq_schedule(sc, txq);
@@ -923,7 +942,7 @@
 			list_add(&bf->list, &bf_head);
 			__skb_unlink(skb, *q);
 			ath_tx_update_baw(sc, tid, seqno);
-			ath_tx_complete_buf(sc, bf, txq, &bf_head, &ts, 0);
+			ath_tx_complete_buf(sc, bf, txq, &bf_head, NULL, &ts, 0);
 			continue;
 		}
 
@@ -1832,6 +1851,7 @@
  */
 void ath_draintxq(struct ath_softc *sc, struct ath_txq *txq)
 {
+	rcu_read_lock();
 	ath_txq_lock(sc, txq);
 
 	if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA) {
@@ -1850,6 +1870,7 @@
 	ath_drain_txq_list(sc, txq, &txq->axq_q);
 
 	ath_txq_unlock_complete(sc, txq);
+	rcu_read_unlock();
 }
 
 bool ath_drain_all_txq(struct ath_softc *sc)
@@ -2472,7 +2493,8 @@
 /*****************/
 
 static void ath_tx_complete(struct ath_softc *sc, struct sk_buff *skb,
-			    int tx_flags, struct ath_txq *txq)
+			    int tx_flags, struct ath_txq *txq,
+			    struct ieee80211_sta *sta)
 {
 	struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb);
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
@@ -2492,15 +2514,17 @@
 			tx_info->flags |= IEEE80211_TX_STAT_ACK;
 	}
 
-	padpos = ieee80211_hdrlen(hdr->frame_control);
-	padsize = padpos & 3;
-	if (padsize && skb->len>padpos+padsize) {
-		/*
-		 * Remove MAC header padding before giving the frame back to
-		 * mac80211.
-		 */
-		memmove(skb->data + padsize, skb->data, padpos);
-		skb_pull(skb, padsize);
+	if (tx_info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS) {
+		padpos = ieee80211_hdrlen(hdr->frame_control);
+		padsize = padpos & 3;
+		if (padsize && skb->len>padpos+padsize) {
+			/*
+			 * Remove MAC header padding before giving the frame back to
+			 * mac80211.
+			 */
+			memmove(skb->data + padsize, skb->data, padpos);
+			skb_pull(skb, padsize);
+		}
 	}
 
 	spin_lock_irqsave(&sc->sc_pm_lock, flags);
@@ -2515,12 +2539,14 @@
 	}
 	spin_unlock_irqrestore(&sc->sc_pm_lock, flags);
 
-	__skb_queue_tail(&txq->complete_q, skb);
 	ath_txq_skb_done(sc, txq, skb);
+	tx_info->status.status_driver_data[0] = sta;
+	__skb_queue_tail(&txq->complete_q, skb);
 }
 
 static void ath_tx_complete_buf(struct ath_softc *sc, struct ath_buf *bf,
 				struct ath_txq *txq, struct list_head *bf_q,
+				struct ieee80211_sta *sta,
 				struct ath_tx_status *ts, int txok)
 {
 	struct sk_buff *skb = bf->bf_mpdu;
@@ -2548,7 +2574,7 @@
 			complete(&sc->paprd_complete);
 	} else {
 		ath_debug_stat_tx(sc, bf, ts, txq, tx_flags);
-		ath_tx_complete(sc, skb, tx_flags, txq);
+		ath_tx_complete(sc, skb, tx_flags, txq, sta);
 	}
 skip_tx_complete:
 	/* At this point, skb (bf->bf_mpdu) is consumed...make sure we don't
@@ -2700,10 +2726,12 @@
 	u32 qcumask = ((1 << ATH9K_NUM_TX_QUEUES) - 1) & ah->intr_txqs;
 	int i;
 
+	rcu_read_lock();
 	for (i = 0; i < ATH9K_NUM_TX_QUEUES; i++) {
 		if (ATH_TXQ_SETUP(sc, i) && (qcumask & (1 << i)))
 			ath_tx_processq(sc, &sc->tx.txq[i]);
 	}
+	rcu_read_unlock();
 }
 
 void ath_tx_edma_tasklet(struct ath_softc *sc)
@@ -2717,6 +2745,7 @@
 	struct list_head *fifo_list;
 	int status;
 
+	rcu_read_lock();
 	for (;;) {
 		if (test_bit(ATH_OP_HW_RESET, &common->op_flags))
 			break;
@@ -2787,6 +2816,7 @@
 		ath_tx_process_buffer(sc, txq, &ts, bf, &bf_head);
 		ath_txq_unlock_complete(sc, txq);
 	}
+	rcu_read_unlock();
 }
 
 /*****************/