Merge branch 'net-mitigate-kmem_free-slowpath'
Jesper Dangaard Brouer says:
====================
net: mitigating kmem_cache free slowpath
This patchset is the first real use-case for kmem_cache bulk _free_.
The use of bulk _alloc_ is NOT included in this patchset. The full use
has previously been posted here [1].
The bulk free side has the largest benefit for the network stack
use-case, because the network stack is hitting the kmem_cache/SLUB
slowpath when freeing SKBs, due to the amount of outstanding SKBs.
This is solved by using the new API kmem_cache_free_bulk().
Introduce a new API, napi_consume_skb(), that hides/handles bulk freeing
for the caller. The drivers simply need to use this call when freeing
SKBs in NAPI context, e.g. replacing their calls to dev_kfree_skb() /
dev_consume_skb_any().
Driver ixgbe is the first user of this new API.
[1] http://thread.gmane.org/gmane.linux.network/384302/focus=397373
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c4003a8..0c701b8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1089,7 +1089,7 @@
* @tx_ring: tx ring to clean
**/
static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
- struct ixgbe_ring *tx_ring)
+ struct ixgbe_ring *tx_ring, int napi_budget)
{
struct ixgbe_adapter *adapter = q_vector->adapter;
struct ixgbe_tx_buffer *tx_buffer;
@@ -1127,7 +1127,7 @@
total_packets += tx_buffer->gso_segs;
/* free the skb */
- dev_consume_skb_any(tx_buffer->skb);
+ napi_consume_skb(tx_buffer->skb, napi_budget);
/* unmap skb header data */
dma_unmap_single(tx_ring->dev,
@@ -2784,7 +2784,7 @@
#endif
ixgbe_for_each_ring(ring, q_vector->tx)
- clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring);
+ clean_complete &= !!ixgbe_clean_tx_irq(q_vector, ring, budget);
/* Exit if we are called by netpoll or busy polling is active */
if ((budget <= 0) || !ixgbe_qv_lock_napi(q_vector))
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a8fc222..6ec86f1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2404,6 +2404,10 @@
{
return __napi_alloc_skb(napi, length, GFP_ATOMIC);
}
+/* Release one reference to @skb from NAPI context. @budget is the NAPI
+ * poll budget; a budget of 0 makes the implementation hand the skb to
+ * dev_consume_skb_irq() instead of batching it for a bulk free.
+ */
+void napi_consume_skb(struct sk_buff *skb, int budget);
+
+/* Per-CPU deferred-free helpers implemented in net/core/skbuff.c:
+ * __kfree_skb_defer() queues an skb for a later bulk free and
+ * __kfree_skb_flush() bulk-frees whatever is currently queued.
+ */
+void __kfree_skb_flush(void);
+void __kfree_skb_defer(struct sk_buff *skb);
/**
* __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index f128483..3f4071a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3829,8 +3829,14 @@
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
- __kfree_skb(skb);
+
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
}
+
+ __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -5155,6 +5161,7 @@
}
}
+ __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b0cce74..a5bd067 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,8 +347,16 @@
}
EXPORT_SYMBOL(build_skb);
+/* Capacity of the per-CPU array of skb pointers awaiting a bulk free. */
+#define NAPI_SKB_CACHE_SIZE 64
+
+/* Per-CPU NAPI state: the page fragment cache that used to be the whole
+ * per-CPU napi_alloc_cache object, extended with a small batch of skb
+ * pointers that are released together via kmem_cache_free_bulk().
+ */
+struct napi_alloc_cache {
+ struct page_frag_cache page; /* passed to __alloc_page_frag() */
+ size_t skb_count; /* number of valid entries in skb_cache[] */
+ void *skb_cache[NAPI_SKB_CACHE_SIZE]; /* skbs queued for bulk free */
+};
+
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
@@ -378,9 +386,9 @@
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(nc, fragsz, gfp_mask);
+ return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -474,7 +482,7 @@
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
void *data;
@@ -494,7 +502,7 @@
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = __alloc_page_frag(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -505,7 +513,7 @@
}
/* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->pfmemalloc)
+ if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -747,6 +755,73 @@
}
EXPORT_SYMBOL(consume_skb);
+/* Bulk-free every skb currently queued in this CPU's napi_alloc_cache
+ * back to skbuff_head_cache and reset the queue to empty. Does nothing
+ * when no skbs are queued.
+ *
+ * NOTE(review): uses this_cpu_ptr() without any locking of its own, so
+ * it presumably relies on the caller running where it cannot migrate or
+ * be re-entered on the same CPU (e.g. softirq context) - confirm.
+ */
+void __kfree_skb_flush(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* flush skb_cache if containing objects */
+ if (nc->skb_count) {
+ kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+
+/* Queue @skb on this CPU's skb_cache for a later bulk free.
+ *
+ * The skb's resources are released immediately through skb_release_all();
+ * only the struct sk_buff itself is kept until the batch is handed to
+ * kmem_cache_free_bulk(). When the queue reaches NAPI_SKB_CACHE_SIZE
+ * entries it is flushed right away, so skb_count is always below the
+ * array size when the next skb is stored.
+ *
+ * Nothing here checks skb->users or skb->fclone; the callers visible in
+ * this patch do that before deferring.
+ */
+static inline void _kfree_skb_defer(struct sk_buff *skb)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* drop skb->head and call any destructors for packet */
+ skb_release_all(skb);
+
+ /* record skb to CPU local list */
+ nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+ /* SLUB writes into objects when freeing */
+ prefetchw(skb);
+#endif
+
+ /* flush skb_cache if it is filled */
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+/* Out-of-line wrapper around the static inline _kfree_skb_defer(), so the
+ * deferral can be used outside this file (it is declared in
+ * include/linux/skbuff.h and called from net/core/dev.c in this patch).
+ */
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+ _kfree_skb_defer(skb);
+}
+
+/**
+ * napi_consume_skb - drop a reference to an skb, batching the final free
+ * @skb: buffer to release; a NULL pointer is ignored
+ * @budget: NAPI poll budget of the caller; 0 selects the non-batched path
+ *
+ * Drops one reference to @skb. If that was the last reference, the skb is
+ * queued on the per-CPU deferred-free list via _kfree_skb_defer(), except
+ * for skbs whose fclone state is not SKB_FCLONE_UNAVAILABLE, which are
+ * freed immediately with __kfree_skb().
+ *
+ * NOTE(review): only @budget == 0 is diverted to dev_consume_skb_irq(); a
+ * negative budget takes the deferred path. The ixgbe caller in this patch
+ * treats "budget <= 0" as netpoll/busy-poll - confirm negative values
+ * cannot reach here.
+ */
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+ if (unlikely(!skb))
+ return;
+
+ /* if budget is 0 assume netpoll w/ IRQs disabled */
+ if (unlikely(!budget)) {
+ dev_consume_skb_irq(skb);
+ return;
+ }
+
+ /* Sole owner: skip the atomic decrement and only issue a read barrier.
+ * Otherwise drop our reference and stop unless it was the last one.
+ * NOTE(review): the smp_rmb() presumably mirrors the ordering used by
+ * the regular kfree_skb()/consume_skb() path - confirm.
+ */
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ /* if reaching here SKB is ready to free */
+ trace_consume_skb(skb);
+
+ /* if SKB is a clone, don't handle this case */
+ /* NOTE(review): presumably because fclone skbs are not allocated from
+ * skbuff_head_cache, which is the cache the bulk free targets - confirm.
+ */
+ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ _kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
BUILD_BUG_ON(offsetof(struct sk_buff, field) < \