/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
		return -EINVAL;

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err = 0;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}
	return pool;
}
EXPORT_SYMBOL(page_pool_create);
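
/* Example (illustrative sketch, not part of the upstream file): how a
 * hypothetical driver might fill in page_pool_params and create one pool
 * per RX ring. The function name, ring size and NUMA node argument are
 * assumptions made purely for illustration.
 */
static inline struct page_pool *
example_create_rx_pool(struct device *dev, int numa_node)
{
	struct page_pool_params pp_params = {
		.order		= 0,			/* order-0: one page per frame */
		.flags		= PP_FLAG_DMA_MAP,	/* pool keeps the DMA mapping */
		.pool_size	= 256,			/* typically the RX ring size */
		.nid		= numa_node,		/* allocate near the RX IRQ/CPU */
		.dev		= dev,
		.dma_dir	= DMA_BIDIRECTIONAL,	/* allow XDP_TX re-use */
	};

	/* Returns a valid pool, or an ERR_PTR() on failure */
	return page_pool_create(&pp_params);
}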

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		/* Slower-path: Alloc array empty, time to refill
		 *
		 * Open-coded bulk ptr_ring consumer.
		 *
		 * Discussion: the ring consumer lock is not really
		 * needed due to the softirq/NAPI protection, but we
		 * will later need the ability to reclaim pages from
		 * the ring, thus keep the locks.
		 */
		spin_lock(&r->consumer_lock);
		while ((page = __ptr_ring_consume(r))) {
			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
				break;
			pool->alloc.cache[pool->alloc.count++] = page;
		}
		spin_unlock(&r->consumer_lock);
		/* Either the page that broke out of the refill loop, or
		 * NULL if the ring ran dry; already cached pages are then
		 * used on the next call.
		 */
		return page;
	}

	/* Slow-path: Get page from locked ring queue */
	page = ptr_ring_consume(&pool->ring);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * Current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance. This code
	 * needs bulk allocation support from the page allocator code.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use page->private for the DMA address.
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page(pool->p.dev, page, 0,
			   (PAGE_SIZE << pool->p.order),
			   pool->p.dma_dir);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	set_page_private(page, dma); /* page->private = dma; */

skip_dma_map:
	/* A page that was just allocated should/must have refcnt 1 */
	return page;
}

/* Use page_pool_alloc_pages() in place of alloc_pages() API calls; it
 * additionally provides a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
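
/* Example (illustrative sketch, not part of the upstream file): allocating
 * a receive buffer from NAPI/softirq context, e.g. when refilling an RX
 * descriptor ring. The helper name and the returned DMA address usage are
 * assumptions made only for illustration.
 */
static inline struct page *example_rx_alloc(struct page_pool *pool,
					    dma_addr_t *dma)
{
	struct page *page;

	/* GFP_ATOMIC: the slow-path may be hit from softirq context */
	page = page_pool_alloc_pages(pool, GFP_ATOMIC);
	if (!page)
		return NULL;

	/* With PP_FLAG_DMA_MAP the DMA address is kept in page->private */
	*dma = page_private(page);
	return page;
}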

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		return;

	/* DMA unmap */
	dma_unmap_page(pool->p.dev, page_private(page),
		       PAGE_SIZE << pool->p.order, pool->p.dma_dir);
	set_page_private(page, 0);
}

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);
	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
					  struct page *page)
{
	int ret;

	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but it has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1)) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Cache full, fallback to free pages */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt case,
	 * the DMA mapping is unmapped/released here, as the driver is
	 * likely doing refcnt based recycle tricks, meaning put_page()
	 * will later be invoked by whoever holds the remaining references.
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
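
/* Example (illustrative sketch, not part of the upstream file): recycling
 * a frame that an XDP program decided to drop. This runs inside RX-NAPI
 * softirq processing, so direct recycling into the lockless alloc-side
 * cache is permitted. The function name is an assumption for illustration.
 */
static inline void example_xdp_drop_recycle(struct page_pool *pool,
					    struct page *page)
{
	/* The driver holds the only reference here (refcnt == 1), so the
	 * page is recycled rather than returned to the page allocator.
	 */
	__page_pool_put_page(pool, page, true /* allow_direct */);
}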

static void __page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		__page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy_rcu(struct rcu_head *rcu)
{
	struct page_pool *pool;

	pool = container_of(rcu, struct page_pool, rcu);

	WARN(pool->alloc.count, "API usage violation");

	__page_pool_empty_ring(pool);
	ptr_ring_cleanup(&pool->ring, NULL);
	kfree(pool);
}

/* Cleanup and release resources */
void page_pool_destroy(struct page_pool *pool)
{
	struct page *page;

	/* Empty the alloc cache; we assume the caller has made sure the
	 * pool is no longer in use, and that page_pool_alloc_pages()
	 * cannot be called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		__page_pool_return_page(pool, page);
	}

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	__page_pool_empty_ring(pool);

	/* An xdp_mem_allocator can still ref page_pool pointer */
	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
}
EXPORT_SYMBOL(page_pool_destroy);
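
/* Example (illustrative sketch, not part of the upstream file): a
 * hypothetical driver tearing down an RX ring. Pages still posted to
 * hardware descriptors are returned first, then the pool is destroyed.
 * The ring structure and its fields are assumptions made for illustration.
 */
struct example_rx_ring {
	struct page	*pages[256];	/* pages currently posted to the NIC */
	unsigned int	count;		/* number of valid entries above */
	struct page_pool *pool;
};

static inline void example_rx_ring_destroy(struct example_rx_ring *ring)
{
	unsigned int i;

	/* RX processing has been stopped, so no allocations are in flight */
	for (i = 0; i < ring->count; i++)
		__page_pool_put_page(ring->pool, ring->pages[i], false);

	/* Returns/frees any cached pages and RCU-frees the pool itself */
	page_pool_destroy(ring->pool);
}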