be2net: add multiple RX queue support

This patch adds multiple RX queue support to be2net. There are
upto 4 extra rx-queues per port into which TCP/UDP traffic can be hashed into.
Some of the ethtool stats are now displayed on a per queue basis.

Signed-off-by: Sathya Perla <sathya.perla@emulex.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/benet/be.h b/drivers/net/benet/be.h
index 4faf696..1afabb1 100644
--- a/drivers/net/benet/be.h
+++ b/drivers/net/benet/be.h
@@ -78,6 +78,8 @@
 #define MCC_Q_LEN		128	/* total size not to exceed 8 pages */
 #define MCC_CQ_LEN		256
 
+#define MAX_RSS_QS		4	/* BE limit is 4 queues/port */
+#define BE_MAX_MSIX_VECTORS	(MAX_RSS_QS + 1 + 1)/* RSS qs + 1 def Rx + Tx */
 #define BE_NAPI_WEIGHT		64
 #define MAX_RX_POST 		BE_NAPI_WEIGHT /* Frags posted at a time */
 #define RX_FRAGS_REFILL_WM	(RX_Q_LEN - MAX_RX_POST)
@@ -157,10 +159,9 @@
 	bool rearm_cq;
 };
 
-struct be_drvr_stats {
+struct be_tx_stats {
 	u32 be_tx_reqs;		/* number of TX requests initiated */
 	u32 be_tx_stops;	/* number of times TX Q was stopped */
-	u32 be_fwd_reqs;	/* number of send reqs through forwarding i/f */
 	u32 be_tx_wrbs;		/* number of tx WRBs used */
 	u32 be_tx_events;	/* number of tx completion events  */
 	u32 be_tx_compl;	/* number of tx completion entries processed */
@@ -169,35 +170,6 @@
 	u64 be_tx_bytes_prev;
 	u64 be_tx_pkts;
 	u32 be_tx_rate;
-
-	u32 cache_barrier[16];
-
-	u32 be_ethrx_post_fail;/* number of ethrx buffer alloc failures */
-	u32 be_rx_polls;	/* number of times NAPI called poll function */
-	u32 be_rx_events;	/* number of ucast rx completion events  */
-	u32 be_rx_compl;	/* number of rx completion entries processed */
-	ulong be_rx_jiffies;
-	u64 be_rx_bytes;
-	u64 be_rx_bytes_prev;
-	u64 be_rx_pkts;
-	u32 be_rx_rate;
-	u32 be_rx_mcast_pkt;
-	/* number of non ether type II frames dropped where
-	 * frame len > length field of Mac Hdr */
-	u32 be_802_3_dropped_frames;
-	/* number of non ether type II frames malformed where
-	 * in frame len < length field of Mac Hdr */
-	u32 be_802_3_malformed_frames;
-	u32 be_rxcp_err;	/* Num rx completion entries w/ err set. */
-	ulong rx_fps_jiffies;	/* jiffies at last FPS calc */
-	u32 be_rx_frags;
-	u32 be_prev_rx_frags;
-	u32 be_rx_fps;		/* Rx frags per second */
-};
-
-struct be_stats_obj {
-	struct be_drvr_stats drvr_stats;
-	struct be_dma_mem cmd;
 };
 
 struct be_tx_obj {
@@ -215,10 +187,34 @@
 	bool last_page_user;
 };
 
+struct be_rx_stats {
+	u32 rx_post_fail;/* number of ethrx buffer alloc failures */
+	u32 rx_polls;	/* number of times NAPI called poll function */
+	u32 rx_events;	/* number of ucast rx completion events  */
+	u32 rx_compl;	/* number of rx completion entries processed */
+	ulong rx_jiffies;
+	u64 rx_bytes;
+	u64 rx_bytes_prev;
+	u64 rx_pkts;
+	u32 rx_rate;
+	u32 rx_mcast_pkts;
+	u32 rxcp_err;	/* Num rx completion entries w/ err set. */
+	ulong rx_fps_jiffies;	/* jiffies at last FPS calc */
+	u32 rx_frags;
+	u32 prev_rx_frags;
+	u32 rx_fps;		/* Rx frags per second */
+};
+
 struct be_rx_obj {
+	struct be_adapter *adapter;
 	struct be_queue_info q;
 	struct be_queue_info cq;
 	struct be_rx_page_info page_info_tbl[RX_Q_LEN];
+	struct be_eq_obj rx_eq;
+	struct be_rx_stats stats;
+	u8 rss_id;
+	bool rx_post_starved;	/* Zero rx frags have been posted to BE */
+	u32 cache_line_barrier[16];
 };
 
 struct be_vf_cfg {
@@ -229,7 +225,6 @@
 	u32 vf_tx_rate;
 };
 
-#define BE_NUM_MSIX_VECTORS		2	/* 1 each for Tx and Rx */
 #define BE_INVALID_PMAC_ID		0xffffffff
 struct be_adapter {
 	struct pci_dev *pdev;
@@ -249,21 +244,21 @@
 	spinlock_t mcc_lock;	/* For serializing mcc cmds to BE card */
 	spinlock_t mcc_cq_lock;
 
-	struct msix_entry msix_entries[BE_NUM_MSIX_VECTORS];
+	struct msix_entry msix_entries[BE_MAX_MSIX_VECTORS];
 	bool msix_enabled;
 	bool isr_registered;
 
 	/* TX Rings */
 	struct be_eq_obj tx_eq;
 	struct be_tx_obj tx_obj;
+	struct be_tx_stats tx_stats;
 
 	u32 cache_line_break[8];
 
 	/* Rx rings */
-	struct be_eq_obj rx_eq;
-	struct be_rx_obj rx_obj;
+	struct be_rx_obj rx_obj[MAX_RSS_QS + 1]; /* one default non-rss Q */
+	u32 num_rx_qs;
 	u32 big_page_size;	/* Compounded page size shared by rx wrbs */
-	bool rx_post_starved;	/* Zero rx frags have been posted to BE */
 
 	struct vlan_group *vlan_grp;
 	u16 vlans_added;
@@ -271,7 +266,7 @@
 	u8 vlan_tag[VLAN_GROUP_ARRAY_LEN];
 	struct be_dma_mem mc_cmd_mem;
 
-	struct be_stats_obj stats;
+	struct be_dma_mem stats_cmd;
 	/* Work queue used to perform periodic tasks like getting statistics */
 	struct delayed_work work;
 
@@ -287,6 +282,7 @@
 	bool promiscuous;
 	bool wol;
 	u32 function_mode;
+	u32 function_caps;
 	u32 rx_fc;		/* Rx flow control */
 	u32 tx_fc;		/* Tx flow control */
 	bool ue_detected;
@@ -313,10 +309,20 @@
 
 extern const struct ethtool_ops be_ethtool_ops;
 
-#define drvr_stats(adapter)		(&adapter->stats.drvr_stats)
+#define tx_stats(adapter)		(&adapter->tx_stats)
+#define rx_stats(rxo)			(&rxo->stats)
 
 #define BE_SET_NETDEV_OPS(netdev, ops)	(netdev->netdev_ops = ops)
 
+#define for_all_rx_queues(adapter, rxo, i)				\
+	for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs;	\
+		i++, rxo++)
+
+/* Just skip the first default non-rss queue */
+#define for_all_rss_queues(adapter, rxo, i)				\
+	for (i = 0, rxo = &adapter->rx_obj[i+1]; i < (adapter->num_rx_qs - 1);\
+		i++, rxo++)
+
 #define PAGE_SHIFT_4K		12
 #define PAGE_SIZE_4K		(1 << PAGE_SHIFT_4K)