r8169: add support for Byte Queue Limits

tested on RTL8168d/8111d model using 'super_netperf 40' with TCP/UDP_STREAM.

Output of
while true; do
    for n in inflight limit; do
          echo -n $n\ ; cat $n;
    done;
    sleep 1;
done

during netperf run, 100mbit peer:

inflight 0
limit 3028
inflight 6056
limit 4542

[ trimmed output for brevity, no limit/inflight changes during
  test steady-state ]

limit 4542
inflight 3028
limit 6122
inflight 0
limit 6122
[ changed cable to 1gbit peer, restart netperf ]
inflight 37850
limit 36336
inflight 33308
limit 31794
inflight 33308
limit 31794
inflight 27252
limit 25738
[ again, no changes during test ]
inflight 27252
limit 25738
inflight 0
limit 28766
[ change cable to 100mbit peer, restart netperf ]
limit 28766
inflight 27370
limit 28766
inflight 4542
limit 5990
inflight 6056
limit 4542
[ .. ]
inflight 6056
limit 4542
inflight 0

[end of test]

Cc: Francois Romieu <romieu@fr.zoreil.com>
Cc: Hayes Wang <hayeswang@realtek.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index c03891b..54476ba 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4733,6 +4733,8 @@
 	RTL_W8(ChipCmd, CmdReset);
 
 	rtl_udelay_loop_wait_low(tp, &rtl_chipcmd_cond, 100, 100);
+
+	netdev_reset_queue(tp->dev);
 }
 
 static void rtl_request_uncached_firmware(struct rtl8169_private *tp)
@@ -6613,6 +6615,8 @@
 
 	txd->opts2 = cpu_to_le32(opts[1]);
 
+	netdev_sent_queue(dev, skb->len);
+
 	skb_tx_timestamp(skb);
 
 	wmb();
@@ -6712,6 +6716,7 @@
 static void rtl_tx(struct net_device *dev, struct rtl8169_private *tp)
 {
 	unsigned int dirty_tx, tx_left;
+	unsigned int bytes_compl = 0, pkts_compl = 0;
 
 	dirty_tx = tp->dirty_tx;
 	smp_rmb();
@@ -6730,10 +6735,8 @@
 		rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb,
 				     tp->TxDescArray + entry);
 		if (status & LastFrag) {
-			u64_stats_update_begin(&tp->tx_stats.syncp);
-			tp->tx_stats.packets++;
-			tp->tx_stats.bytes += tx_skb->skb->len;
-			u64_stats_update_end(&tp->tx_stats.syncp);
+			pkts_compl++;
+			bytes_compl += tx_skb->skb->len;
 			dev_kfree_skb_any(tx_skb->skb);
 			tx_skb->skb = NULL;
 		}
@@ -6742,6 +6745,13 @@
 	}
 
 	if (tp->dirty_tx != dirty_tx) {
+		netdev_completed_queue(tp->dev, pkts_compl, bytes_compl);
+
+		u64_stats_update_begin(&tp->tx_stats.syncp);
+		tp->tx_stats.packets += pkts_compl;
+		tp->tx_stats.bytes += bytes_compl;
+		u64_stats_update_end(&tp->tx_stats.syncp);
+
 		tp->dirty_tx = dirty_tx;
 		/* Sync with rtl8169_start_xmit:
 		 * - publish dirty_tx ring index (write barrier)