[NET]: Make NAPI polling independent of struct net_device objects.
Several devices have multiple independent RX queues per net
device, and some have a single interrupt doorbell for several
queues.
In either case, it's easier to support layouts like that if the
structure representing the poll is independent from the net
device itself.
The signature of the ->poll() callback goes from:
int foo_poll(struct net_device *dev, int *budget)
to
int foo_poll(struct napi_struct *napi, int budget)
The caller is returned the number of RX packets processed (or
the number of "NAPI credits" consumed if you want to get
abstract). The callee no longer messes around bumping
dev->quota, *budget, etc. because that is all handled in the
caller upon return.
The napi_struct is to be embedded in the device driver private data
structures.
Furthermore, it is the driver's responsibility to disable all NAPI
instances in its ->stop() device close handler. Since the
napi_struct is privatized into the driver's private data structures,
only the driver knows how to get at all of the napi_struct instances
it may have per-device.
With lots of help and suggestions from Rusty Russell, Roland Dreier,
Michael Chan, Jeff Garzik, and Jamal Hadi Salim.
Bug fixes from Thomas Graf, Roland Dreier, Peter Zijlstra,
Joseph Fannin, Scott Wood, Hans J. Koch, and Michael Chan.
[ Ported to current tree and all drivers converted. Integrated
Stephen's follow-on kerneldoc additions, and restored poll_list
handling to the old style to fix mutual exclusion issues. -DaveM ]
Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 20e887d..0442617 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -49,11 +49,13 @@
typedef irqreturn_t(*intr_handler_t) (int, void *);
struct vlan_group;
-
struct adapter;
+struct sge_qset;
+
struct port_info {
struct adapter *adapter;
struct vlan_group *vlan_grp;
+ struct sge_qset *qs;
const struct port_type_info *port_type;
u8 port_id;
u8 rx_csum_offload;
@@ -173,10 +175,12 @@
};
struct sge_qset { /* an SGE queue set */
+ struct adapter *adap;
+ struct napi_struct napi;
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
struct sge_txq txq[SGE_TXQ_PER_SET];
- struct net_device *netdev; /* associated net device */
+ struct net_device *netdev;
unsigned long txq_stopped; /* which Tx queues are stopped */
struct timer_list tx_reclaim_timer; /* reclaims TX buffers */
unsigned long port_stats[SGE_PSTAT_MAX];
@@ -221,12 +225,6 @@
struct delayed_work adap_check_task;
struct work_struct ext_intr_handler_task;
- /*
- * Dummy netdevices are needed when using multiple receive queues with
- * NAPI as each netdevice can service only one queue.
- */
- struct net_device *dummy_netdev[SGE_QSETS - 1];
-
struct dentry *debugfs_root;
struct mutex mdio_lock;
@@ -253,12 +251,6 @@
return netdev_priv(adap->port[idx]);
}
-/*
- * We use the spare atalk_ptr to map a net device to its SGE queue set.
- * This is a macro so it can be used as l-value.
- */
-#define dev2qset(netdev) ((netdev)->atalk_ptr)
-
#define OFFLOAD_DEVMAP_BIT 15
#define tdev2adap(d) container_of(d, struct adapter, tdev)
@@ -284,7 +276,7 @@
void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p);
int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
int irq_vec_idx, const struct qset_params *p,
- int ntxq, struct net_device *netdev);
+ int ntxq, struct net_device *dev);
int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
unsigned char *data);
irqreturn_t t3_sge_intr_msix(int irq, void *cookie);
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 5ab319c..5db7d4e 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -339,49 +339,17 @@
V_RRCPLCPUSIZE(6), cpus, rspq_map);
}
-/*
- * If we have multiple receive queues per port serviced by NAPI we need one
- * netdevice per queue as NAPI operates on netdevices. We already have one
- * netdevice, namely the one associated with the interface, so we use dummy
- * ones for any additional queues. Note that these netdevices exist purely
- * so that NAPI has something to work with, they do not represent network
- * ports and are not registered.
- */
-static int init_dummy_netdevs(struct adapter *adap)
+static void init_napi(struct adapter *adap)
{
- int i, j, dummy_idx = 0;
- struct net_device *nd;
+ int i;
- for_each_port(adap, i) {
- struct net_device *dev = adap->port[i];
- const struct port_info *pi = netdev_priv(dev);
+ for (i = 0; i < SGE_QSETS; i++) {
+ struct sge_qset *qs = &adap->sge.qs[i];
- for (j = 0; j < pi->nqsets - 1; j++) {
- if (!adap->dummy_netdev[dummy_idx]) {
- struct port_info *p;
-
- nd = alloc_netdev(sizeof(*p), "", ether_setup);
- if (!nd)
- goto free_all;
-
- p = netdev_priv(nd);
- p->adapter = adap;
- nd->weight = 64;
- set_bit(__LINK_STATE_START, &nd->state);
- adap->dummy_netdev[dummy_idx] = nd;
- }
- strcpy(adap->dummy_netdev[dummy_idx]->name, dev->name);
- dummy_idx++;
- }
+ if (qs->adap)
+ netif_napi_add(qs->netdev, &qs->napi, qs->napi.poll,
+ 64);
}
- return 0;
-
-free_all:
- while (--dummy_idx >= 0) {
- free_netdev(adap->dummy_netdev[dummy_idx]);
- adap->dummy_netdev[dummy_idx] = NULL;
- }
- return -ENOMEM;
}
/*
@@ -392,20 +360,18 @@
static void quiesce_rx(struct adapter *adap)
{
int i;
- struct net_device *dev;
- for_each_port(adap, i) {
- dev = adap->port[i];
- while (test_bit(__LINK_STATE_RX_SCHED, &dev->state))
- msleep(1);
- }
+ for (i = 0; i < SGE_QSETS; i++)
+ if (adap->sge.qs[i].adap)
+ napi_disable(&adap->sge.qs[i].napi);
+}
- for (i = 0; i < ARRAY_SIZE(adap->dummy_netdev); i++) {
- dev = adap->dummy_netdev[i];
- if (dev)
- while (test_bit(__LINK_STATE_RX_SCHED, &dev->state))
- msleep(1);
- }
+static void enable_all_napi(struct adapter *adap)
+{
+ int i;
+ for (i = 0; i < SGE_QSETS; i++)
+ if (adap->sge.qs[i].adap)
+ napi_enable(&adap->sge.qs[i].napi);
}
/**
@@ -418,7 +384,7 @@
*/
static int setup_sge_qsets(struct adapter *adap)
{
- int i, j, err, irq_idx = 0, qset_idx = 0, dummy_dev_idx = 0;
+ int i, j, err, irq_idx = 0, qset_idx = 0;
unsigned int ntxq = SGE_TXQ_PER_SET;
if (adap->params.rev > 0 && !(adap->flags & USING_MSI))
@@ -426,15 +392,14 @@
for_each_port(adap, i) {
struct net_device *dev = adap->port[i];
- const struct port_info *pi = netdev_priv(dev);
+ struct port_info *pi = netdev_priv(dev);
+ pi->qs = &adap->sge.qs[pi->first_qset];
for (j = 0; j < pi->nqsets; ++j, ++qset_idx) {
err = t3_sge_alloc_qset(adap, qset_idx, 1,
(adap->flags & USING_MSIX) ? qset_idx + 1 :
irq_idx,
- &adap->params.sge.qset[qset_idx], ntxq,
- j == 0 ? dev :
- adap-> dummy_netdev[dummy_dev_idx++]);
+ &adap->params.sge.qset[qset_idx], ntxq, dev);
if (err) {
t3_free_sge_resources(adap);
return err;
@@ -845,21 +810,18 @@
goto out;
}
- err = init_dummy_netdevs(adap);
- if (err)
- goto out;
-
err = t3_init_hw(adap, 0);
if (err)
goto out;
t3_write_reg(adap, A_ULPRX_TDDP_PSZ, V_HPZ0(PAGE_SHIFT - 12));
-
+
err = setup_sge_qsets(adap);
if (err)
goto out;
setup_rss(adap);
+ init_napi(adap);
adap->flags |= FULL_INIT_DONE;
}
@@ -886,6 +848,7 @@
adap->name, adap)))
goto irq_err;
+ enable_all_napi(adap);
t3_sge_start(adap);
t3_intr_enable(adap);
@@ -1012,8 +975,10 @@
int other_ports = adapter->open_device_map & PORT_MASK;
int err;
- if (!adapter->open_device_map && (err = cxgb_up(adapter)) < 0)
+ if (!adapter->open_device_map && (err = cxgb_up(adapter)) < 0) {
+ quiesce_rx(adapter);
return err;
+ }
set_bit(pi->port_id, &adapter->open_device_map);
if (is_offload(adapter) && !ofld_disable) {
@@ -2524,7 +2489,6 @@
#ifdef CONFIG_NET_POLL_CONTROLLER
netdev->poll_controller = cxgb_netpoll;
#endif
- netdev->weight = 64;
SET_ETHTOOL_OPS(netdev, &cxgb_ethtool_ops);
}
@@ -2625,12 +2589,6 @@
t3_free_sge_resources(adapter);
cxgb_disable_msi(adapter);
- for (i = 0; i < ARRAY_SIZE(adapter->dummy_netdev); i++)
- if (adapter->dummy_netdev[i]) {
- free_netdev(adapter->dummy_netdev[i]);
- adapter->dummy_netdev[i] = NULL;
- }
-
for_each_port(adapter, i)
if (adapter->port[i])
free_netdev(adapter->port[i]);
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 58a5f60..069c1ac 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -591,9 +591,6 @@
q->rspq.desc, q->rspq.phys_addr);
}
- if (q->netdev)
- q->netdev->atalk_ptr = NULL;
-
memset(q, 0, sizeof(*q));
}
@@ -1074,7 +1071,7 @@
unsigned int ndesc, pidx, credits, gen, compl;
const struct port_info *pi = netdev_priv(dev);
struct adapter *adap = pi->adapter;
- struct sge_qset *qs = dev2qset(dev);
+ struct sge_qset *qs = pi->qs;
struct sge_txq *q = &qs->txq[TXQ_ETH];
/*
@@ -1326,13 +1323,12 @@
struct sk_buff *skb;
struct sge_qset *qs = (struct sge_qset *)data;
struct sge_txq *q = &qs->txq[TXQ_CTRL];
- const struct port_info *pi = netdev_priv(qs->netdev);
- struct adapter *adap = pi->adapter;
spin_lock(&q->lock);
again:reclaim_completed_tx_imm(q);
- while (q->in_use < q->size && (skb = __skb_dequeue(&q->sendq)) != NULL) {
+ while (q->in_use < q->size &&
+ (skb = __skb_dequeue(&q->sendq)) != NULL) {
write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
@@ -1354,7 +1350,7 @@
}
spin_unlock(&q->lock);
- t3_write_reg(adap, A_SG_KDOORBELL,
+ t3_write_reg(qs->adap, A_SG_KDOORBELL,
F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
}
@@ -1638,8 +1634,7 @@
else {
struct sge_qset *qs = rspq_to_qset(q);
- if (__netif_rx_schedule_prep(qs->netdev))
- __netif_rx_schedule(qs->netdev);
+ napi_schedule(&qs->napi);
q->rx_head = skb;
}
q->rx_tail = skb;
@@ -1675,34 +1670,30 @@
* receive handler. Batches need to be of modest size as we do prefetches
* on the packets in each.
*/
-static int ofld_poll(struct net_device *dev, int *budget)
+static int ofld_poll(struct napi_struct *napi, int budget)
{
- const struct port_info *pi = netdev_priv(dev);
- struct adapter *adapter = pi->adapter;
- struct sge_qset *qs = dev2qset(dev);
+ struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
struct sge_rspq *q = &qs->rspq;
- int work_done, limit = min(*budget, dev->quota), avail = limit;
+ struct adapter *adapter = qs->adap;
+ int work_done = 0;
- while (avail) {
+ while (work_done < budget) {
struct sk_buff *head, *tail, *skbs[RX_BUNDLE_SIZE];
int ngathered;
spin_lock_irq(&q->lock);
head = q->rx_head;
if (!head) {
- work_done = limit - avail;
- *budget -= work_done;
- dev->quota -= work_done;
- __netif_rx_complete(dev);
+ napi_complete(napi);
spin_unlock_irq(&q->lock);
- return 0;
+ return work_done;
}
tail = q->rx_tail;
q->rx_head = q->rx_tail = NULL;
spin_unlock_irq(&q->lock);
- for (ngathered = 0; avail && head; avail--) {
+ for (ngathered = 0; work_done < budget && head; work_done++) {
prefetch(head->data);
skbs[ngathered] = head;
head = head->next;
@@ -1724,10 +1715,8 @@
}
deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
}
- work_done = limit - avail;
- *budget -= work_done;
- dev->quota -= work_done;
- return 1;
+
+ return work_done;
}
/**
@@ -2071,50 +2060,47 @@
/**
* napi_rx_handler - the NAPI handler for Rx processing
- * @dev: the net device
+ * @napi: the napi instance
* @budget: how many packets we can process in this round
*
* Handler for new data events when using NAPI.
*/
-static int napi_rx_handler(struct net_device *dev, int *budget)
+static int napi_rx_handler(struct napi_struct *napi, int budget)
{
- const struct port_info *pi = netdev_priv(dev);
- struct adapter *adap = pi->adapter;
- struct sge_qset *qs = dev2qset(dev);
- int effective_budget = min(*budget, dev->quota);
+ struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
+ struct adapter *adap = qs->adap;
+ int work_done = process_responses(adap, qs, budget);
- int work_done = process_responses(adap, qs, effective_budget);
- *budget -= work_done;
- dev->quota -= work_done;
+ if (likely(work_done < budget)) {
+ napi_complete(napi);
- if (work_done >= effective_budget)
- return 1;
-
- netif_rx_complete(dev);
-
- /*
- * Because we don't atomically flush the following write it is
- * possible that in very rare cases it can reach the device in a way
- * that races with a new response being written plus an error interrupt
- * causing the NAPI interrupt handler below to return unhandled status
- * to the OS. To protect against this would require flushing the write
- * and doing both the write and the flush with interrupts off. Way too
- * expensive and unjustifiable given the rarity of the race.
- *
- * The race cannot happen at all with MSI-X.
- */
- t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
- V_NEWTIMER(qs->rspq.next_holdoff) |
- V_NEWINDEX(qs->rspq.cidx));
- return 0;
+ /*
+ * Because we don't atomically flush the following
+ * write it is possible that in very rare cases it can
+ * reach the device in a way that races with a new
+ * response being written plus an error interrupt
+ * causing the NAPI interrupt handler below to return
+ * unhandled status to the OS. To protect against
+ * this would require flushing the write and doing
+ * both the write and the flush with interrupts off.
+ * Way too expensive and unjustifiable given the
+ * rarity of the race.
+ *
+ * The race cannot happen at all with MSI-X.
+ */
+ t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
+ V_NEWTIMER(qs->rspq.next_holdoff) |
+ V_NEWINDEX(qs->rspq.cidx));
+ }
+ return work_done;
}
/*
* Returns true if the device is already scheduled for polling.
*/
-static inline int napi_is_scheduled(struct net_device *dev)
+static inline int napi_is_scheduled(struct napi_struct *napi)
{
- return test_bit(__LINK_STATE_RX_SCHED, &dev->state);
+ return test_bit(NAPI_STATE_SCHED, &napi->state);
}
/**
@@ -2197,8 +2183,7 @@
V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
return 0;
}
- if (likely(__netif_rx_schedule_prep(qs->netdev)))
- __netif_rx_schedule(qs->netdev);
+ napi_schedule(&qs->napi);
return 1;
}
@@ -2209,8 +2194,7 @@
irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
{
struct sge_qset *qs = cookie;
- const struct port_info *pi = netdev_priv(qs->netdev);
- struct adapter *adap = pi->adapter;
+ struct adapter *adap = qs->adap;
struct sge_rspq *q = &qs->rspq;
spin_lock(&q->lock);
@@ -2229,13 +2213,11 @@
irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
{
struct sge_qset *qs = cookie;
- const struct port_info *pi = netdev_priv(qs->netdev);
- struct adapter *adap = pi->adapter;
struct sge_rspq *q = &qs->rspq;
spin_lock(&q->lock);
- if (handle_responses(adap, q) < 0)
+ if (handle_responses(qs->adap, q) < 0)
q->unhandled_irqs++;
spin_unlock(&q->lock);
return IRQ_HANDLED;
@@ -2278,11 +2260,13 @@
return IRQ_HANDLED;
}
-static int rspq_check_napi(struct net_device *dev, struct sge_rspq *q)
+static int rspq_check_napi(struct sge_qset *qs)
{
- if (!napi_is_scheduled(dev) && is_new_response(&q->desc[q->cidx], q)) {
- if (likely(__netif_rx_schedule_prep(dev)))
- __netif_rx_schedule(dev);
+ struct sge_rspq *q = &qs->rspq;
+
+ if (!napi_is_scheduled(&qs->napi) &&
+ is_new_response(&q->desc[q->cidx], q)) {
+ napi_schedule(&qs->napi);
return 1;
}
return 0;
@@ -2303,10 +2287,9 @@
spin_lock(&q->lock);
- new_packets = rspq_check_napi(adap->sge.qs[0].netdev, q);
+ new_packets = rspq_check_napi(&adap->sge.qs[0]);
if (adap->params.nports == 2)
- new_packets += rspq_check_napi(adap->sge.qs[1].netdev,
- &adap->sge.qs[1].rspq);
+ new_packets += rspq_check_napi(&adap->sge.qs[1]);
if (!new_packets && t3_slow_intr_handler(adap) == 0)
q->unhandled_irqs++;
@@ -2409,9 +2392,9 @@
static irqreturn_t t3b_intr_napi(int irq, void *cookie)
{
u32 map;
- struct net_device *dev;
struct adapter *adap = cookie;
- struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
+ struct sge_qset *qs0 = &adap->sge.qs[0];
+ struct sge_rspq *q0 = &qs0->rspq;
t3_write_reg(adap, A_PL_CLI, 0);
map = t3_read_reg(adap, A_SG_DATA_INTR);
@@ -2424,18 +2407,11 @@
if (unlikely(map & F_ERRINTR))
t3_slow_intr_handler(adap);
- if (likely(map & 1)) {
- dev = adap->sge.qs[0].netdev;
+ if (likely(map & 1))
+ napi_schedule(&qs0->napi);
- if (likely(__netif_rx_schedule_prep(dev)))
- __netif_rx_schedule(dev);
- }
- if (map & 2) {
- dev = adap->sge.qs[1].netdev;
-
- if (likely(__netif_rx_schedule_prep(dev)))
- __netif_rx_schedule(dev);
- }
+ if (map & 2)
+ napi_schedule(&adap->sge.qs[1].napi);
spin_unlock(&q0->lock);
return IRQ_HANDLED;
@@ -2514,8 +2490,7 @@
{
spinlock_t *lock;
struct sge_qset *qs = (struct sge_qset *)data;
- const struct port_info *pi = netdev_priv(qs->netdev);
- struct adapter *adap = pi->adapter;
+ struct adapter *adap = qs->adap;
if (spin_trylock(&qs->txq[TXQ_ETH].lock)) {
reclaim_completed_tx(adap, &qs->txq[TXQ_ETH]);
@@ -2526,9 +2501,9 @@
spin_unlock(&qs->txq[TXQ_OFLD].lock);
}
lock = (adap->flags & USING_MSIX) ? &qs->rspq.lock :
- &adap->sge.qs[0].rspq.lock;
+ &adap->sge.qs[0].rspq.lock;
if (spin_trylock_irq(lock)) {
- if (!napi_is_scheduled(qs->netdev)) {
+ if (!napi_is_scheduled(&qs->napi)) {
u32 status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
if (qs->fl[0].credits < qs->fl[0].size)
@@ -2562,12 +2537,9 @@
*/
void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
{
- if (!qs->netdev)
- return;
-
qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
qs->rspq.polling = p->polling;
- qs->netdev->poll = p->polling ? napi_rx_handler : ofld_poll;
+ qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
}
/**
@@ -2587,7 +2559,7 @@
*/
int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
int irq_vec_idx, const struct qset_params *p,
- int ntxq, struct net_device *netdev)
+ int ntxq, struct net_device *dev)
{
int i, ret = -ENOMEM;
struct sge_qset *q = &adapter->sge.qs[id];
@@ -2708,16 +2680,10 @@
}
spin_unlock(&adapter->sge.reg_lock);
- q->netdev = netdev;
- t3_update_qset_coalesce(q, p);
- /*
- * We use atalk_ptr as a backpointer to a qset. In case a device is
- * associated with multiple queue sets only the first one sets
- * atalk_ptr.
- */
- if (netdev->atalk_ptr == NULL)
- netdev->atalk_ptr = q;
+ q->adap = adapter;
+ q->netdev = dev;
+ t3_update_qset_coalesce(q, p);
refill_fl(adapter, &q->fl[0], q->fl[0].size, GFP_KERNEL);
refill_fl(adapter, &q->fl[1], q->fl[1].size, GFP_KERNEL);