bnx2x: Multiple concurrent l2 traffic classes

Overview:
 Support mapping of priorities to traffic classes and of traffic
 classes to transmission queue ranges in the net device.
 The queue ranges are (count, offset) pairs relating to the txq
 array.
 This mapping can be configured via DCBX negotiation or by the kernel.
 As a result, Enhanced Transmission Selection (ETS) and Priority Flow
 Control (PFC) are supported between L2 network traffic classes.

 Mapping:
 This patch uses the netdev_set_num_tc, netdev_set_prio_tc_map and
 netdev_set_tc_queue functions to map priorities to traffic classes
 and traffic classes to transmission queue ranges.
 This mapping is performed by the bnx2x_setup_tc function, which is
 connected to the ndo_setup_tc callback.
 This function is always called at nic load, where by default it
 maps all priorities to tc 0; it may also be called by the kernel
 or by bnx2x upon DCBX negotiation to modify the mapping.
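
 As a rough illustration (not the driver's exact code), the kind of
 mapping these netdev calls establish could look as follows; the
 queues_per_tc parameter is hypothetical and stands in for
 MAX_TXQS_PER_COS:

	#include <linux/netdevice.h>

	/* Illustrative sketch only: map 8 priorities onto num_tc traffic
	 * classes and give each class a contiguous (count, offset) range
	 * in the txq array.  In the patch below the count is
	 * BNX2X_NUM_ETH_QUEUES() and the block stride is MAX_TXQS_PER_COS.
	 */
	static int example_map_tcs(struct net_device *dev, u8 num_tc,
				   u16 queues_per_tc)
	{
		u8 prio, tc;
		int rc;

		rc = netdev_set_num_tc(dev, num_tc);
		if (rc)
			return rc;

		/* priority -> traffic class (here simply prio % num_tc) */
		for (prio = 0; prio < 8; prio++)
			netdev_set_prio_tc_map(dev, prio, prio % num_tc);

		/* traffic class -> (count, offset) range of tx queues */
		for (tc = 0; tc < num_tc; tc++)
			netdev_set_tc_queue(dev, tc, queues_per_tc,
					    tc * queues_per_tc);

		return 0;
	}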

 rtnl lock:
 When ndo_setup_tc is called at nic load or by the kernel, the rtnl
 lock is already held. However, when DCBX negotiation takes place
 the lock is not held, so the work is scheduled to be handled by
 the sp_rtnl task.
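
 A rough sketch of the deferred path, under assumed names (the
 hypothetical example_adapter stands in for struct bnx2x and
 setup_tc_work for the sp_rtnl task); the load-time and kernel paths
 simply call bnx2x_setup_tc with rtnl already held:

	#include <linux/kernel.h>
	#include <linux/netdevice.h>
	#include <linux/rtnetlink.h>
	#include <linux/workqueue.h>

	int bnx2x_setup_tc(struct net_device *dev, u8 num_tc); /* bnx2x_cmn.h */

	/* Hypothetical context used only for this sketch */
	struct example_adapter {
		struct net_device *netdev;
		struct work_struct setup_tc_work;
		u8 negotiated_num_tc;
	};

	/* DCBX completion runs without rtnl, so it only schedules this
	 * work item; the handler takes rtnl itself before touching the
	 * tc configuration.
	 */
	static void example_setup_tc_work(struct work_struct *work)
	{
		struct example_adapter *ad =
			container_of(work, struct example_adapter,
				     setup_tc_work);

		rtnl_lock();
		bnx2x_setup_tc(ad->netdev, ad->negotiated_num_tc);
		rtnl_unlock();
	}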

 Fastpath:
 The fastpath structure of the bnx2x, which was previously used
 to hold the information of one tx queue and one rx queue, was
 redesigned to represent multiple tx queues, one for each traffic
 class.
 The transmission queue supplied in the skb by the kernel can no
 longer be interpreted as a straightforward index into the fastpath
 structure array; it must instead be decoded into the appropriate
 fastpath index and the tc within that fastpath.
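
 A minimal sketch of that decode, consistent with the (count, offset)
 layout set up by bnx2x_setup_tc below and essentially what the
 TXQ_TO_FP()/TXQ_TO_COS() helpers used in bnx2x_start_xmit() reduce
 to (txqs_per_cos stands in for MAX_TXQS_PER_COS):

	#include <linux/types.h>

	/* Each CoS owns a block of txqs_per_cos consecutive txq indices,
	 * so the fastpath index and the CoS fall out of a mod/div pair.
	 */
	static inline void example_decode_txq(u16 txq_index, u16 txqs_per_cos,
					      u16 *fp_index, u16 *cos)
	{
		*fp_index = txq_index % txqs_per_cos;	/* which fastpath  */
		*cos = txq_index / txqs_per_cos;	/* which tc in it  */
	}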

 Slowpath:
 The bnx2x queue object was redesigned to accommodate multiple
 transmission queues. The queue object's state machine was enhanced
 to allow opening multiple transmission-only connections on top of
 the regular tx-rx connection.
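
 For illustration only (names and sizes here are assumptions, not the
 exact driver definitions), the resulting shape is roughly:

	#include <linux/types.h>

	#define EXAMPLE_MAX_COS 3	/* assumed upper bound on CoS count */

	/* Hypothetical sketch: the queue object carries one connection id
	 * per CoS; cids[0] is the regular tx-rx connection, the others are
	 * tx-only connections opened by the enhanced state machine.
	 */
	struct example_queue_obj {
		u32 cids[EXAMPLE_MAX_COS];
		u8 max_cos;		/* number of CoS actually in use */
		/* ... state, flags and callbacks as before ... */
	};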

 Firmware:
 This feature relies on the tx-only queue feature introduced in the
 bnx2x 7.0.23 firmware; the firmware must likewise provide multi-CoS
 support.

 Signed-off-by: Ariel Elior <ariele@broadcom.com>
 Signed-off-by: Eilon Greenstein <eilong@broadcom.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 8763625..e5fac62 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -47,6 +47,25 @@
 
 	/* Restore the NAPI object as it has been already initialized */
 	fp->napi = orig_napi;
+
+	fp->bp = bp;
+	fp->index = index;
+	if (IS_ETH_FP(fp))
+		fp->max_cos = bp->max_cos;
+	else
+		/* Special queues support only one CoS */
+		fp->max_cos = 1;
+
+	/*
+	 * set the tpa flag for each queue. The tpa flag determines the queue
+	 * minimal size so it must be set prior to queue memory allocation
+	 */
+	fp->disable_tpa = ((bp->flags & TPA_ENABLE_FLAG) == 0);
+
+#ifdef BCM_CNIC
+	/* We don't want TPA on FCoE, FWD and OOO L2 rings */
+	bnx2x_fcoe(bp, disable_tpa) = 1;
+#endif
 }
 
 /**
@@ -77,10 +96,10 @@
 /* free skb in the packet ring at pos idx
  * return idx of last bd freed
  */
-static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
+static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata,
 			     u16 idx)
 {
-	struct sw_tx_bd *tx_buf = &fp->tx_buf_ring[idx];
+	struct sw_tx_bd *tx_buf = &txdata->tx_buf_ring[idx];
 	struct eth_tx_start_bd *tx_start_bd;
 	struct eth_tx_bd *tx_data_bd;
 	struct sk_buff *skb = tx_buf->skb;
@@ -91,11 +110,11 @@
 	prefetch(&skb->end);
 
 	DP(BNX2X_MSG_FP, "fp[%d]: pkt_idx %d  buff @(%p)->skb %p\n",
-	   fp->index, idx, tx_buf, skb);
+	   txdata->txq_index, idx, tx_buf, skb);
 
 	/* unmap first bd */
 	DP(BNX2X_MSG_OFF, "free bd_idx %d\n", bd_idx);
-	tx_start_bd = &fp->tx_desc_ring[bd_idx].start_bd;
+	tx_start_bd = &txdata->tx_desc_ring[bd_idx].start_bd;
 	dma_unmap_single(&bp->pdev->dev, BD_UNMAP_ADDR(tx_start_bd),
 			 BD_UNMAP_LEN(tx_start_bd), DMA_TO_DEVICE);
 
@@ -126,7 +145,7 @@
 	while (nbd > 0) {
 
 		DP(BNX2X_MSG_OFF, "free frag bd_idx %d\n", bd_idx);
-		tx_data_bd = &fp->tx_desc_ring[bd_idx].reg_bd;
+		tx_data_bd = &txdata->tx_desc_ring[bd_idx].reg_bd;
 		dma_unmap_page(&bp->pdev->dev, BD_UNMAP_ADDR(tx_data_bd),
 			       BD_UNMAP_LEN(tx_data_bd), DMA_TO_DEVICE);
 		if (--nbd)
@@ -142,20 +161,19 @@
 	return new_cons;
 }
 
-int bnx2x_tx_int(struct bnx2x_fastpath *fp)
+int bnx2x_tx_int(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata)
 {
-	struct bnx2x *bp = fp->bp;
 	struct netdev_queue *txq;
-	u16 hw_cons, sw_cons, bd_cons = fp->tx_bd_cons;
+	u16 hw_cons, sw_cons, bd_cons = txdata->tx_bd_cons;
 
 #ifdef BNX2X_STOP_ON_ERROR
 	if (unlikely(bp->panic))
 		return -1;
 #endif
 
-	txq = netdev_get_tx_queue(bp->dev, fp->index);
-	hw_cons = le16_to_cpu(*fp->tx_cons_sb);
-	sw_cons = fp->tx_pkt_cons;
+	txq = netdev_get_tx_queue(bp->dev, txdata->txq_index);
+	hw_cons = le16_to_cpu(*txdata->tx_cons_sb);
+	sw_cons = txdata->tx_pkt_cons;
 
 	while (sw_cons != hw_cons) {
 		u16 pkt_cons;
@@ -164,14 +182,14 @@
 
 		DP(NETIF_MSG_TX_DONE, "queue[%d]: hw_cons %u  sw_cons %u "
 				      " pkt_cons %u\n",
-		   fp->index, hw_cons, sw_cons, pkt_cons);
+		   txdata->txq_index, hw_cons, sw_cons, pkt_cons);
 
-		bd_cons = bnx2x_free_tx_pkt(bp, fp, pkt_cons);
+		bd_cons = bnx2x_free_tx_pkt(bp, txdata, pkt_cons);
 		sw_cons++;
 	}
 
-	fp->tx_pkt_cons = sw_cons;
-	fp->tx_bd_cons = bd_cons;
+	txdata->tx_pkt_cons = sw_cons;
+	txdata->tx_bd_cons = bd_cons;
 
 	/* Need to make the tx_bd_cons update visible to start_xmit()
 	 * before checking for netif_tx_queue_stopped().  Without the
@@ -199,7 +217,7 @@
 
 		if ((netif_tx_queue_stopped(txq)) &&
 		    (bp->state == BNX2X_STATE_OPEN) &&
-		    (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3))
+		    (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3))
 			netif_tx_wake_queue(txq);
 
 		__netif_tx_unlock(txq);
@@ -777,6 +795,7 @@
 {
 	struct bnx2x_fastpath *fp = fp_cookie;
 	struct bnx2x *bp = fp->bp;
+	u8 cos;
 
 	DP(BNX2X_MSG_FP, "got an MSI-X interrupt on IDX:SB "
 			 "[fp %d fw_sd %d igusb %d]\n",
@@ -790,7 +809,10 @@
 
 	/* Handle Rx and Tx according to MSI-X vector */
 	prefetch(fp->rx_cons_sb);
-	prefetch(fp->tx_cons_sb);
+
+	for_each_cos_in_tx_queue(fp, cos)
+		prefetch(fp->txdata[cos].tx_cons_sb);
+
 	prefetch(&fp->sb_running_index[SM_RX_ID]);
 	napi_schedule(&bnx2x_fp(bp, fp->index, napi));
 
@@ -1060,17 +1082,22 @@
 static void bnx2x_free_tx_skbs(struct bnx2x *bp)
 {
 	int i;
+	u8 cos;
 
 	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
 
-		u16 bd_cons = fp->tx_bd_cons;
-		u16 sw_prod = fp->tx_pkt_prod;
-		u16 sw_cons = fp->tx_pkt_cons;
+			u16 bd_cons = txdata->tx_bd_cons;
+			u16 sw_prod = txdata->tx_pkt_prod;
+			u16 sw_cons = txdata->tx_pkt_cons;
 
-		while (sw_cons != sw_prod) {
-			bd_cons = bnx2x_free_tx_pkt(bp, fp, TX_BD(sw_cons));
-			sw_cons++;
+			while (sw_cons != sw_prod) {
+				bd_cons = bnx2x_free_tx_pkt(bp, txdata,
+							    TX_BD(sw_cons));
+				sw_cons++;
+			}
 		}
 	}
 }
@@ -1174,7 +1201,7 @@
 {
 	if (bp->flags & USING_MSIX_FLAG)
 		bnx2x_free_msix_irqs(bp, BNX2X_NUM_ETH_QUEUES(bp) +
-				     CNIC_CONTEXT_USE + 1);
+				     CNIC_PRESENT + 1);
 	else if (bp->flags & USING_MSI_FLAG)
 		free_irq(bp->pdev->irq, bp->dev);
 	else
@@ -1196,6 +1223,7 @@
 	   bp->msix_table[msix_vec].entry, bp->msix_table[msix_vec].entry);
 	msix_vec++;
 #endif
+	/* We need separate vectors for ETH queues only (not FCoE) */
 	for_each_eth_queue(bp, i) {
 		bp->msix_table[msix_vec].entry = msix_vec;
 		DP(NETIF_MSG_IFUP, "msix_table[%d].entry = %d "
@@ -1203,7 +1231,7 @@
 		msix_vec++;
 	}
 
-	req_cnt = BNX2X_NUM_ETH_QUEUES(bp) + CNIC_CONTEXT_USE + 1;
+	req_cnt = BNX2X_NUM_ETH_QUEUES(bp) + CNIC_PRESENT + 1;
 
 	rc = pci_enable_msix(bp->pdev, &bp->msix_table[0], req_cnt);
 
@@ -1278,7 +1306,7 @@
 	}
 
 	i = BNX2X_NUM_ETH_QUEUES(bp);
-	offset = 1 + CNIC_CONTEXT_USE;
+	offset = 1 + CNIC_PRESENT;
 	netdev_info(bp->dev, "using MSI-X  IRQs: sp %d  fp[%d] %d"
 	       " ... fp[%d] %d\n",
 	       bp->msix_table[0].vector,
@@ -1393,13 +1421,12 @@
 
 		/* If ethertype is FCoE or FIP - use FCoE ring */
 		if ((ether_type == ETH_P_FCOE) || (ether_type == ETH_P_FIP))
-			return bnx2x_fcoe(bp, index);
+			return bnx2x_fcoe_tx(bp, txq_index);
 	}
 #endif
 	/* Select a none-FCoE queue:  if FCoE is enabled, exclude FCoE L2 ring
 	 */
-	return __skb_tx_hash(dev, skb,
-			dev->real_num_tx_queues - FCOE_CONTEXT_USE);
+	return __skb_tx_hash(dev, skb, BNX2X_NUM_ETH_QUEUES(bp));
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
@@ -1418,20 +1445,38 @@
 	}
 
 	/* Add special queues */
-	bp->num_queues += NONE_ETH_CONTEXT_USE;
+	bp->num_queues += NON_ETH_CONTEXT_USE;
 }
 
 static inline int bnx2x_set_real_num_queues(struct bnx2x *bp)
 {
-	int rc, num = bp->num_queues;
+	int rc, tx, rx;
 
+	tx = MAX_TXQS_PER_COS * bp->max_cos;
+	rx = BNX2X_NUM_ETH_QUEUES(bp);
+
+/* account for fcoe queue */
 #ifdef BCM_CNIC
-	if (NO_FCOE(bp))
-		num -= FCOE_CONTEXT_USE;
-
+	if (!NO_FCOE(bp)) {
+		rx += FCOE_PRESENT;
+		tx += FCOE_PRESENT;
+	}
 #endif
-	netif_set_real_num_tx_queues(bp->dev, num);
-	rc = netif_set_real_num_rx_queues(bp->dev, num);
+
+	rc = netif_set_real_num_tx_queues(bp->dev, tx);
+	if (rc) {
+		BNX2X_ERR("Failed to set real number of Tx queues: %d\n", rc);
+		return rc;
+	}
+	rc = netif_set_real_num_rx_queues(bp->dev, rx);
+	if (rc) {
+		BNX2X_ERR("Failed to set real number of Rx queues: %d\n", rc);
+		return rc;
+	}
+
+	DP(NETIF_MSG_DRV, "Setting real num queues to (tx, rx) (%d, %d)\n",
+			  tx, rx);
+
 	return rc;
 }
 
@@ -1661,28 +1706,18 @@
 	/* must be called before memory allocation and HW init */
 	bnx2x_ilt_set_info(bp);
 
-	/* zero fastpath structures preserving invariants like napi which are
-	 * allocated only once
+	/*
+	 * Zero fastpath structures preserving invariants like napi, which are
+	 * allocated only once, fp index, max_cos, bp pointer.
+	 * Also set fp->disable_tpa.
 	 */
 	for_each_queue(bp, i)
 		bnx2x_bz_fp(bp, i);
 
+
 	/* Set the receive queues buffer size */
 	bnx2x_set_rx_buf_size(bp);
 
-	/*
-	 * set the tpa flag for each queue. The tpa flag determines the queue
-	 * minimal size so it must be set prior to queue memory allocation
-	 */
-	for_each_queue(bp, i)
-		bnx2x_fp(bp, i, disable_tpa) =
-					((bp->flags & TPA_ENABLE_FLAG) == 0);
-
-#ifdef BCM_CNIC
-	/* We don't want TPA on FCoE L2 ring */
-	bnx2x_fcoe(bp, disable_tpa) = 1;
-#endif
-
 	if (bnx2x_alloc_mem(bp))
 		return -ENOMEM;
 
@@ -1696,6 +1731,12 @@
 		LOAD_ERROR_EXIT(bp, load_error0);
 	}
 
+	/* configure multi cos mappings in kernel.
+	 * this configuration may be overridden by a multi class queue discipline
+	 * or by a dcbx negotiation result.
+	 */
+	bnx2x_setup_tc(bp->dev, bp->max_cos);
+
 	bnx2x_napi_enable(bp);
 
 	/* Send LOAD_REQUEST command to MCP
@@ -1747,6 +1788,7 @@
 		queue_delayed_work(bnx2x_wq, &bp->period_task, 0);
 	} else
 		bp->port.pmf = 0;
+
 	DP(NETIF_MSG_LINK, "pmf %d\n", bp->port.pmf);
 
 	/* Init Function state controlling object */
@@ -2089,6 +2131,7 @@
 int bnx2x_poll(struct napi_struct *napi, int budget)
 {
 	int work_done = 0;
+	u8 cos;
 	struct bnx2x_fastpath *fp = container_of(napi, struct bnx2x_fastpath,
 						 napi);
 	struct bnx2x *bp = fp->bp;
@@ -2101,8 +2144,10 @@
 		}
 #endif
 
-		if (bnx2x_has_tx_work(fp))
-			bnx2x_tx_int(fp);
+		for_each_cos_in_tx_queue(fp, cos)
+			if (bnx2x_tx_queue_has_work(&fp->txdata[cos]))
+				bnx2x_tx_int(bp, &fp->txdata[cos]);
+
 
 		if (bnx2x_has_rx_work(fp)) {
 			work_done += bnx2x_rx_int(fp, budget - work_done);
@@ -2164,7 +2209,7 @@
  * in Other Operating Systems(TM)
  */
 static noinline u16 bnx2x_tx_split(struct bnx2x *bp,
-				   struct bnx2x_fastpath *fp,
+				   struct bnx2x_fp_txdata *txdata,
 				   struct sw_tx_bd *tx_buf,
 				   struct eth_tx_start_bd **tx_bd, u16 hlen,
 				   u16 bd_prod, int nbd)
@@ -2185,7 +2230,7 @@
 	/* now get a new data BD
 	 * (after the pbd) and fill it */
 	bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
-	d_tx_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
+	d_tx_bd = &txdata->tx_desc_ring[bd_prod].reg_bd;
 
 	mapping = HILO_U64(le32_to_cpu(h_tx_bd->addr_hi),
 			   le32_to_cpu(h_tx_bd->addr_lo)) + hlen;
@@ -2481,8 +2526,10 @@
 netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct bnx2x *bp = netdev_priv(dev);
+
 	struct bnx2x_fastpath *fp;
 	struct netdev_queue *txq;
+	struct bnx2x_fp_txdata *txdata;
 	struct sw_tx_bd *tx_buf;
 	struct eth_tx_start_bd *tx_start_bd, *first_bd;
 	struct eth_tx_bd *tx_data_bd, *total_pkt_bd = NULL;
@@ -2490,7 +2537,7 @@
 	struct eth_tx_parse_bd_e2 *pbd_e2 = NULL;
 	u32 pbd_e2_parsing_data = 0;
 	u16 pkt_prod, bd_prod;
-	int nbd, fp_index;
+	int nbd, txq_index, fp_index, txdata_index;
 	dma_addr_t mapping;
 	u32 xmit_type = bnx2x_xmit_type(bp, skb);
 	int i;
@@ -2504,12 +2551,43 @@
 		return NETDEV_TX_BUSY;
 #endif
 
-	fp_index = skb_get_queue_mapping(skb);
-	txq = netdev_get_tx_queue(dev, fp_index);
+	txq_index = skb_get_queue_mapping(skb);
+	txq = netdev_get_tx_queue(dev, txq_index);
 
+	BUG_ON(txq_index >= MAX_ETH_TXQ_IDX(bp) + FCOE_PRESENT);
+
+	/* decode the fastpath index and the cos index from the txq */
+	fp_index = TXQ_TO_FP(txq_index);
+	txdata_index = TXQ_TO_COS(txq_index);
+
+#ifdef BCM_CNIC
+	/*
+	 * Override the above for the FCoE queue:
+	 *   - FCoE fp entry is right after the ETH entries.
+	 *   - FCoE L2 queue uses bp->txdata[0] only.
+	 */
+	if (unlikely(!NO_FCOE(bp) && (txq_index ==
+				      bnx2x_fcoe_tx(bp, txq_index)))) {
+		fp_index = FCOE_IDX;
+		txdata_index = 0;
+	}
+#endif
+
+	/* enable this debug print to view the transmission queue being used
+	DP(BNX2X_MSG_FP, "indices: txq %d, fp %d, txdata %d",
+	   txq_index, fp_index, txdata_index); */
+
+	/* locate the fastpath and the txdata */
 	fp = &bp->fp[fp_index];
+	txdata = &fp->txdata[txdata_index];
 
-	if (unlikely(bnx2x_tx_avail(fp) < (skb_shinfo(skb)->nr_frags + 3))) {
+	/* enable this debug print to view the transmission details
+	DP(BNX2X_MSG_FP,"transmitting packet cid %d fp index %d txdata_index %d"
+			" tx_data ptr %p fp pointer %p",
+	   txdata->cid, fp_index, txdata_index, txdata, fp); */
+
+	if (unlikely(bnx2x_tx_avail(bp, txdata) <
+		     (skb_shinfo(skb)->nr_frags + 3))) {
 		fp->eth_q_stats.driver_xoff++;
 		netif_tx_stop_queue(txq);
 		BNX2X_ERR("BUG! Tx ring full when queue awake!\n");
@@ -2518,7 +2596,7 @@
 
 	DP(NETIF_MSG_TX_QUEUED, "queue[%d]: SKB: summed %x  protocol %x  "
 				"protocol(%x,%x) gso type %x  xmit_type %x\n",
-	   fp_index, skb->ip_summed, skb->protocol, ipv6_hdr(skb)->nexthdr,
+	   txq_index, skb->ip_summed, skb->protocol, ipv6_hdr(skb)->nexthdr,
 	   ip_hdr(skb)->protocol, skb_shinfo(skb)->gso_type, xmit_type);
 
 	eth = (struct ethhdr *)skb->data;
@@ -2567,15 +2645,15 @@
 	/* get current pkt produced now - advance it just before sending packet
 	 * since mapping of pages may fail and cause packet to be dropped
 	 */
-	pkt_prod = fp->tx_pkt_prod;
-	bd_prod = TX_BD(fp->tx_bd_prod);
+	pkt_prod = txdata->tx_pkt_prod;
+	bd_prod = TX_BD(txdata->tx_bd_prod);
 
 	/* get a tx_buf and first BD
 	 * tx_start_bd may be changed during SPLIT,
 	 * but first_bd will always stay first
 	 */
-	tx_buf = &fp->tx_buf_ring[TX_BD(pkt_prod)];
-	tx_start_bd = &fp->tx_desc_ring[bd_prod].start_bd;
+	tx_buf = &txdata->tx_buf_ring[TX_BD(pkt_prod)];
+	tx_start_bd = &txdata->tx_desc_ring[bd_prod].start_bd;
 	first_bd = tx_start_bd;
 
 	tx_start_bd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_START_BD;
@@ -2586,13 +2664,13 @@
 	SET_FLAG(tx_start_bd->general_data, ETH_TX_START_BD_HDR_NBDS, 1);
 
 	/* remember the first BD of the packet */
-	tx_buf->first_bd = fp->tx_bd_prod;
+	tx_buf->first_bd = txdata->tx_bd_prod;
 	tx_buf->skb = skb;
 	tx_buf->flags = 0;
 
 	DP(NETIF_MSG_TX_QUEUED,
 	   "sending pkt %u @%p  next_idx %u  bd %u @%p\n",
-	   pkt_prod, tx_buf, fp->tx_pkt_prod, bd_prod, tx_start_bd);
+	   pkt_prod, tx_buf, txdata->tx_pkt_prod, bd_prod, tx_start_bd);
 
 	if (vlan_tx_tag_present(skb)) {
 		tx_start_bd->vlan_or_ethertype =
@@ -2609,7 +2687,7 @@
 		bnx2x_set_sbd_csum(bp, skb, tx_start_bd, xmit_type);
 
 	if (!CHIP_IS_E1x(bp)) {
-		pbd_e2 = &fp->tx_desc_ring[bd_prod].parse_bd_e2;
+		pbd_e2 = &txdata->tx_desc_ring[bd_prod].parse_bd_e2;
 		memset(pbd_e2, 0, sizeof(struct eth_tx_parse_bd_e2));
 		/* Set PBD in checksum offload case */
 		if (xmit_type & XMIT_CSUM)
@@ -2631,7 +2709,7 @@
 					      eth->h_dest);
 		}
 	} else {
-		pbd_e1x = &fp->tx_desc_ring[bd_prod].parse_bd_e1x;
+		pbd_e1x = &txdata->tx_desc_ring[bd_prod].parse_bd_e1x;
 		memset(pbd_e1x, 0, sizeof(struct eth_tx_parse_bd_e1x));
 		/* Set PBD in checksum offload case */
 		if (xmit_type & XMIT_CSUM)
@@ -2663,8 +2741,9 @@
 		tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_SW_LSO;
 
 		if (unlikely(skb_headlen(skb) > hlen))
-			bd_prod = bnx2x_tx_split(bp, fp, tx_buf, &tx_start_bd,
-						 hlen, bd_prod, ++nbd);
+			bd_prod = bnx2x_tx_split(bp, txdata, tx_buf,
+						 &tx_start_bd, hlen,
+						 bd_prod, ++nbd);
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_set_pbd_gso_e2(skb, &pbd_e2_parsing_data,
 					     xmit_type);
@@ -2698,14 +2777,15 @@
 			 * before call to bnx2x_free_tx_pkt
 			 */
 			first_bd->nbd = cpu_to_le16(nbd);
-			bnx2x_free_tx_pkt(bp, fp, TX_BD(fp->tx_pkt_prod));
+			bnx2x_free_tx_pkt(bp, txdata,
+					  TX_BD(txdata->tx_pkt_prod));
 			return NETDEV_TX_OK;
 		}
 
 		bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
-		tx_data_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
+		tx_data_bd = &txdata->tx_desc_ring[bd_prod].reg_bd;
 		if (total_pkt_bd == NULL)
-			total_pkt_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
+			total_pkt_bd = &txdata->tx_desc_ring[bd_prod].reg_bd;
 
 		tx_data_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 		tx_data_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
@@ -2759,7 +2839,7 @@
 		   pbd_e2->parsing_data);
 	DP(NETIF_MSG_TX_QUEUED, "doorbell: nbd %d  bd %u\n", nbd, bd_prod);
 
-	fp->tx_pkt_prod++;
+	txdata->tx_pkt_prod++;
 	/*
 	 * Make sure that the BD data is updated before updating the producer
 	 * since FW might read the BD right after the producer is updated.
@@ -2769,16 +2849,16 @@
 	 */
 	wmb();
 
-	fp->tx_db.data.prod += nbd;
+	txdata->tx_db.data.prod += nbd;
 	barrier();
 
-	DOORBELL(bp, fp->cid, fp->tx_db.raw);
+	DOORBELL(bp, txdata->cid, txdata->tx_db.raw);
 
 	mmiowb();
 
-	fp->tx_bd_prod += nbd;
+	txdata->tx_bd_prod += nbd;
 
-	if (unlikely(bnx2x_tx_avail(fp) < MAX_SKB_FRAGS + 3)) {
+	if (unlikely(bnx2x_tx_avail(bp, txdata) < MAX_SKB_FRAGS + 3)) {
 		netif_tx_stop_queue(txq);
 
 		/* paired memory barrier is in bnx2x_tx_int(), we have to keep
@@ -2787,14 +2867,81 @@
 		smp_mb();
 
 		fp->eth_q_stats.driver_xoff++;
-		if (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3)
+		if (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3)
 			netif_tx_wake_queue(txq);
 	}
-	fp->tx_pkt++;
+	txdata->tx_pkt++;
 
 	return NETDEV_TX_OK;
 }
 
+/**
+ * bnx2x_setup_tc - routine to configure net_device for multi tc
+ *
+ * @dev: net device to configure
+ * @num_tc: number of traffic classes to enable
+ *
+ * callback connected to the ndo_setup_tc function pointer
+ */
+int bnx2x_setup_tc(struct net_device *dev, u8 num_tc)
+{
+	int cos, prio, count, offset;
+	struct bnx2x *bp = netdev_priv(dev);
+
+	/* setup tc must be called under rtnl lock */
+	ASSERT_RTNL();
+
+	/* no traffic classes requested. aborting */
+	if (!num_tc) {
+		netdev_reset_tc(dev);
+		return 0;
+	}
+
+	/* requested to support too many traffic classes */
+	if (num_tc > bp->max_cos) {
+		DP(NETIF_MSG_TX_ERR, "support for too many traffic classes"
+				     " requested: %d. max supported is %d",
+				     num_tc, bp->max_cos);
+		return -EINVAL;
+	}
+
+	/* declare the number of supported traffic classes */
+	if (netdev_set_num_tc(dev, num_tc)) {
+		DP(NETIF_MSG_TX_ERR, "failed to declare %d traffic classes",
+				     num_tc);
+		return -EINVAL;
+	}
+
+	/* configure priority to traffic class mapping */
+	for (prio = 0; prio < BNX2X_MAX_PRIORITY; prio++) {
+		netdev_set_prio_tc_map(dev, prio, bp->prio_to_cos[prio]);
+		DP(BNX2X_MSG_SP, "mapping priority %d to tc %d",
+		   prio, bp->prio_to_cos[prio]);
+	}
+
+
+	/* Use this configuration to differentiate tc0 from other COSes
+	   This can be used for ets or pfc, and saves the effort of setting
+	   up a multi class queue disc or negotiating DCBX with a switch
+	netdev_set_prio_tc_map(dev, 0, 0);
+	DP(BNX2X_MSG_SP, "mapping priority %d to tc %d", 0, 0);
+	for (prio = 1; prio < 16; prio++) {
+		netdev_set_prio_tc_map(dev, prio, 1);
+		DP(BNX2X_MSG_SP, "mapping priority %d to tc %d", prio, 1);
+	} */
+
+	/* configure traffic class to transmission queue mapping */
+	for (cos = 0; cos < bp->max_cos; cos++) {
+		count = BNX2X_NUM_ETH_QUEUES(bp);
+		offset = cos * MAX_TXQS_PER_COS;
+		netdev_set_tc_queue(dev, cos, count, offset);
+		DP(BNX2X_MSG_SP, "mapping tc %d to offset %d count %d",
+		   cos, offset, count);
+	}
+
+	return 0;
+}
+
 /* called with rtnl_lock */
 int bnx2x_change_mac_addr(struct net_device *dev, void *p)
 {
@@ -2823,6 +2970,7 @@
 {
 	union host_hc_status_block *sb = &bnx2x_fp(bp, fp_index, status_blk);
 	struct bnx2x_fastpath *fp = &bp->fp[fp_index];
+	u8 cos;
 
 	/* Common */
 #ifdef BCM_CNIC
@@ -2871,10 +3019,18 @@
 	/* Tx */
 	if (!skip_tx_queue(bp, fp_index)) {
 		/* fastpath tx rings: tx_buf tx_desc */
-		BNX2X_FREE(bnx2x_fp(bp, fp_index, tx_buf_ring));
-		BNX2X_PCI_FREE(bnx2x_fp(bp, fp_index, tx_desc_ring),
-			       bnx2x_fp(bp, fp_index, tx_desc_mapping),
-			       sizeof(union eth_tx_bd_types) * NUM_TX_BD);
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
+
+			DP(BNX2X_MSG_SP,
+			   "freeing tx memory of fp %d cos %d cid %d",
+			   fp_index, cos, txdata->cid);
+
+			BNX2X_FREE(txdata->tx_buf_ring);
+			BNX2X_PCI_FREE(txdata->tx_desc_ring,
+				txdata->tx_desc_mapping,
+				sizeof(union eth_tx_bd_types) * NUM_TX_BD);
+		}
 	}
 	/* end of fastpath */
 }
@@ -2907,19 +3063,17 @@
 	union host_hc_status_block *sb;
 	struct bnx2x_fastpath *fp = &bp->fp[index];
 	int ring_size = 0;
+	u8 cos;
 
 	/* if rx_ring_size specified - use it */
 	int rx_ring_size = bp->rx_ring_size ? bp->rx_ring_size :
-			   MAX_RX_AVAIL/bp->num_queues;
+			   MAX_RX_AVAIL/BNX2X_NUM_RX_QUEUES(bp);
 
 	/* allocate at least number of buffers required by FW */
-	rx_ring_size = max_t(int, fp->disable_tpa ? MIN_RX_SIZE_NONTPA :
+	rx_ring_size = max_t(int, bp->disable_tpa ? MIN_RX_SIZE_NONTPA :
 						    MIN_RX_SIZE_TPA,
 				  rx_ring_size);
 
-	bnx2x_fp(bp, index, bp) = bp;
-	bnx2x_fp(bp, index, index) = index;
-
 	/* Common */
 	sb = &bnx2x_fp(bp, index, status_blk);
 #ifdef BCM_CNIC
@@ -2947,11 +3101,19 @@
 	/* Tx */
 	if (!skip_tx_queue(bp, index)) {
 		/* fastpath tx rings: tx_buf tx_desc */
-		BNX2X_ALLOC(bnx2x_fp(bp, index, tx_buf_ring),
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
+
+			DP(BNX2X_MSG_SP, "allocating tx memory of "
+					 "fp %d cos %d",
+			   index, cos);
+
+			BNX2X_ALLOC(txdata->tx_buf_ring,
 				sizeof(struct sw_tx_bd) * NUM_TX_BD);
-		BNX2X_PCI_ALLOC(bnx2x_fp(bp, index, tx_desc_ring),
-				&bnx2x_fp(bp, index, tx_desc_mapping),
+			BNX2X_PCI_ALLOC(txdata->tx_desc_ring,
+				&txdata->tx_desc_mapping,
 				sizeof(union eth_tx_bd_types) * NUM_TX_BD);
+		}
 	}
 
 	/* Rx */
@@ -2994,7 +3156,7 @@
 						index, ring_size);
 	/* FW will drop all packets if queue is not big enough,
 	 * In these cases we disable the queue
-	 * Min size diferent for TPA and non-TPA queues
+	 * Min size is different for OOO, TPA and non-TPA queues
 	 */
 	if (ring_size < (fp->disable_tpa ?
 				MIN_RX_SIZE_NONTPA : MIN_RX_SIZE_TPA)) {
@@ -3012,12 +3174,14 @@
 	/**
 	 * 1. Allocate FP for leading - fatal if error
 	 * 2. {CNIC} Allocate FCoE FP - fatal if error
-	 * 3. Allocate RSS - fix number of queues if error
+	 * 3. {CNIC} Allocate OOO + FWD - disable OOO if error
+	 * 4. Allocate RSS - fix number of queues if error
 	 */
 
 	/* leading */
 	if (bnx2x_alloc_fp_mem_at(bp, 0))
 		return -ENOMEM;
+
 #ifdef BCM_CNIC
 	if (!NO_FCOE(bp))
 		/* FCoE */
@@ -3027,6 +3191,7 @@
 			 */
 			return -ENOMEM;
 #endif
+
 	/* RSS */
 	for_each_nondefault_eth_queue(bp, i)
 		if (bnx2x_alloc_fp_mem_at(bp, i))
@@ -3044,7 +3209,7 @@
 		 * FCOE_IDX < FWD_IDX < OOO_IDX
 		 */
 
-		/* move FCoE fp */
+		/* move FCoE fp even if NO_FCOE_FLAG is on */
 		bnx2x_move_fp(bp, FCOE_IDX, FCOE_IDX - delta);
 #endif
 		bp->num_queues -= delta;
@@ -3067,16 +3232,23 @@
 	struct bnx2x_fastpath *fp;
 	struct msix_entry *tbl;
 	struct bnx2x_ilt *ilt;
+	int msix_table_size = 0;
 
-	/* fp array */
-	fp = kzalloc(L2_FP_COUNT(bp->l2_cid_count)*sizeof(*fp), GFP_KERNEL);
+	/*
+	 * The biggest MSI-X table we might need is as a maximum number of fast
+	 * path IGU SBs plus default SB (for PF).
+	 */
+	msix_table_size = bp->igu_sb_cnt + 1;
+
+	/* fp array: RSS plus CNIC related L2 queues */
+	fp = kzalloc((BNX2X_MAX_RSS_COUNT(bp) + NON_ETH_CONTEXT_USE) *
+		     sizeof(*fp), GFP_KERNEL);
 	if (!fp)
 		goto alloc_err;
 	bp->fp = fp;
 
 	/* msix table */
-	tbl = kzalloc((FP_SB_COUNT(bp->l2_cid_count) + 1) * sizeof(*tbl),
-				  GFP_KERNEL);
+	tbl = kzalloc(msix_table_size * sizeof(*tbl), GFP_KERNEL);
 	if (!tbl)
 		goto alloc_err;
 	bp->msix_table = tbl;