/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */

| 13 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, |
| 14 | unsigned int cpu, int page_idx) |
| 15 | { |
| 16 | /* must not be used on pre-mapped chunk */ |
| 17 | WARN_ON(chunk->immutable); |
| 18 | |
| 19 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); |
| 20 | } |
| 21 | |
| 22 | /** |
| 23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap |
| 24 | * @chunk: chunk of interest |
| 25 | * @bitmapp: output parameter for bitmap |
| 26 | * @may_alloc: may allocate the array |
| 27 | * |
| 28 | * Returns pointer to array of pointers to struct page and bitmap, |
| 29 | * both of which can be indexed with pcpu_page_idx(). The returned |
| 30 | * array is cleared to zero and *@bitmapp is copied from |
| 31 | * @chunk->populated. Note that there is only one array and bitmap |
| 32 | * and access exclusion is the caller's responsibility. |
| 33 | * |
| 34 | * CONTEXT: |
| 35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. |
| 36 | * Otherwise, don't care. |
| 37 | * |
| 38 | * RETURNS: |
| 39 | * Pointer to temp pages array on success, NULL on failure. |
| 40 | */ |
| 41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, |
| 42 | unsigned long **bitmapp, |
| 43 | bool may_alloc) |
| 44 | { |
| 45 | static struct page **pages; |
| 46 | static unsigned long *bitmap; |
| 47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); |
| 48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * |
| 49 | sizeof(unsigned long); |
| 50 | |
| 51 | if (!pages || !bitmap) { |
| 52 | if (may_alloc && !pages) |
| 53 | pages = pcpu_mem_alloc(pages_size); |
| 54 | if (may_alloc && !bitmap) |
| 55 | bitmap = pcpu_mem_alloc(bitmap_size); |
| 56 | if (!pages || !bitmap) |
| 57 | return NULL; |
| 58 | } |
| 59 | |
| 60 | memset(pages, 0, pages_size); |
| 61 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); |
| 62 | |
| 63 | *bitmapp = bitmap; |
| 64 | return pages; |
| 65 | } |
| 66 | |
| 67 | /** |
| 68 | * pcpu_free_pages - free pages which were allocated for @chunk |
| 69 | * @chunk: chunk pages were allocated for |
| 70 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() |
| 71 | * @populated: populated bitmap |
| 72 | * @page_start: page index of the first page to be freed |
| 73 | * @page_end: page index of the last page to be freed + 1 |
| 74 | * |
| 75 | * Free pages [@page_start and @page_end) in @pages for all units. |
| 76 | * The pages were allocated for @chunk. |
| 77 | */ |
| 78 | static void pcpu_free_pages(struct pcpu_chunk *chunk, |
| 79 | struct page **pages, unsigned long *populated, |
| 80 | int page_start, int page_end) |
| 81 | { |
| 82 | unsigned int cpu; |
| 83 | int i; |
| 84 | |
| 85 | for_each_possible_cpu(cpu) { |
| 86 | for (i = page_start; i < page_end; i++) { |
| 87 | struct page *page = pages[pcpu_page_idx(cpu, i)]; |
| 88 | |
| 89 | if (page) |
| 90 | __free_page(page); |
| 91 | } |
| 92 | } |
| 93 | } |
| 94 | |
| 95 | /** |
| 96 | * pcpu_alloc_pages - allocates pages for @chunk |
| 97 | * @chunk: target chunk |
| 98 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() |
| 99 | * @populated: populated bitmap |
| 100 | * @page_start: page index of the first page to be allocated |
| 101 | * @page_end: page index of the last page to be allocated + 1 |
| 102 | * |
| 103 | * Allocate pages [@page_start,@page_end) into @pages for all units. |
| 104 | * The allocation is for @chunk. Percpu core doesn't care about the |
| 105 | * content of @pages and will pass it verbatim to pcpu_map_pages(). |
| 106 | */ |
| 107 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
| 108 | struct page **pages, unsigned long *populated, |
| 109 | int page_start, int page_end) |
| 110 | { |
| 111 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; |
| 112 | unsigned int cpu; |
| 113 | int i; |
| 114 | |
| 115 | for_each_possible_cpu(cpu) { |
| 116 | for (i = page_start; i < page_end; i++) { |
| 117 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; |
| 118 | |
| 119 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); |
| 120 | if (!*pagep) { |
| 121 | pcpu_free_pages(chunk, pages, populated, |
| 122 | page_start, page_end); |
| 123 | return -ENOMEM; |
| 124 | } |
| 125 | } |
| 126 | } |
| 127 | return 0; |
| 128 | } |
| 129 | |
| 130 | /** |
| 131 | * pcpu_pre_unmap_flush - flush cache prior to unmapping |
| 132 | * @chunk: chunk the regions to be flushed belongs to |
| 133 | * @page_start: page index of the first page to be flushed |
| 134 | * @page_end: page index of the last page to be flushed + 1 |
| 135 | * |
| 136 | * Pages in [@page_start,@page_end) of @chunk are about to be |
| 137 | * unmapped. Flush cache. As each flushing trial can be very |
| 138 | * expensive, issue flush on the whole region at once rather than |
| 139 | * doing it for each cpu. This could be an overkill but is more |
| 140 | * scalable. |
| 141 | */ |
| 142 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, |
| 143 | int page_start, int page_end) |
| 144 | { |
| 145 | flush_cache_vunmap( |
| 146 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), |
| 147 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); |
| 148 | } |
| 149 | |
| 150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
| 151 | { |
| 152 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); |
| 153 | } |
| 154 | |
| 155 | /** |
| 156 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk |
| 157 | * @chunk: chunk of interest |
| 158 | * @pages: pages array which can be used to pass information to free |
| 159 | * @populated: populated bitmap |
| 160 | * @page_start: page index of the first page to unmap |
| 161 | * @page_end: page index of the last page to unmap + 1 |
| 162 | * |
| 163 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. |
| 164 | * Corresponding elements in @pages were cleared by the caller and can |
| 165 | * be used to carry information to pcpu_free_pages() which will be |
| 166 | * called after all unmaps are finished. The caller should call |
| 167 | * proper pre/post flush functions. |
| 168 | */ |
| 169 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, |
| 170 | struct page **pages, unsigned long *populated, |
| 171 | int page_start, int page_end) |
| 172 | { |
| 173 | unsigned int cpu; |
| 174 | int i; |
| 175 | |
| 176 | for_each_possible_cpu(cpu) { |
| 177 | for (i = page_start; i < page_end; i++) { |
| 178 | struct page *page; |
| 179 | |
| 180 | page = pcpu_chunk_page(chunk, cpu, i); |
| 181 | WARN_ON(!page); |
| 182 | pages[pcpu_page_idx(cpu, i)] = page; |
| 183 | } |
| 184 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 185 | page_end - page_start); |
| 186 | } |
| 187 | |
| 188 | for (i = page_start; i < page_end; i++) |
| 189 | __clear_bit(i, populated); |
| 190 | } |
| 191 | |
| 192 | /** |
| 193 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping |
| 194 | * @chunk: pcpu_chunk the regions to be flushed belong to |
| 195 | * @page_start: page index of the first page to be flushed |
| 196 | * @page_end: page index of the last page to be flushed + 1 |
| 197 | * |
| 198 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush |
| 199 | * TLB for the regions. This can be skipped if the area is to be |
| 200 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. |
| 201 | * |
| 202 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once |
| 203 | * for the whole region. |
| 204 | */ |
| 205 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, |
| 206 | int page_start, int page_end) |
| 207 | { |
| 208 | flush_tlb_kernel_range( |
| 209 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), |
| 210 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); |
| 211 | } |
| 212 | |
| 213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, |
| 214 | int nr_pages) |
| 215 | { |
| 216 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, |
| 217 | PAGE_KERNEL, pages); |
| 218 | } |
| 219 | |
| 220 | /** |
| 221 | * pcpu_map_pages - map pages into a pcpu_chunk |
| 222 | * @chunk: chunk of interest |
| 223 | * @pages: pages array containing pages to be mapped |
| 224 | * @populated: populated bitmap |
| 225 | * @page_start: page index of the first page to map |
| 226 | * @page_end: page index of the last page to map + 1 |
| 227 | * |
| 228 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The |
| 229 | * caller is responsible for calling pcpu_post_map_flush() after all |
| 230 | * mappings are complete. |
| 231 | * |
| 232 | * This function is responsible for setting corresponding bits in |
| 233 | * @chunk->populated bitmap and whatever is necessary for reverse |
| 234 | * lookup (addr -> chunk). |
| 235 | */ |
| 236 | static int pcpu_map_pages(struct pcpu_chunk *chunk, |
| 237 | struct page **pages, unsigned long *populated, |
| 238 | int page_start, int page_end) |
| 239 | { |
| 240 | unsigned int cpu, tcpu; |
| 241 | int i, err; |
| 242 | |
| 243 | for_each_possible_cpu(cpu) { |
| 244 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 245 | &pages[pcpu_page_idx(cpu, page_start)], |
| 246 | page_end - page_start); |
| 247 | if (err < 0) |
| 248 | goto err; |
| 249 | } |
| 250 | |
| 251 | /* mapping successful, link chunk and mark populated */ |
| 252 | for (i = page_start; i < page_end; i++) { |
| 253 | for_each_possible_cpu(cpu) |
| 254 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], |
| 255 | chunk); |
| 256 | __set_bit(i, populated); |
| 257 | } |
| 258 | |
| 259 | return 0; |
| 260 | |
| 261 | err: |
| 262 | for_each_possible_cpu(tcpu) { |
| 263 | if (tcpu == cpu) |
| 264 | break; |
| 265 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), |
| 266 | page_end - page_start); |
| 267 | } |
| 268 | return err; |
| 269 | } |
| 270 | |
| 271 | /** |
| 272 | * pcpu_post_map_flush - flush cache after mapping |
| 273 | * @chunk: pcpu_chunk the regions to be flushed belong to |
| 274 | * @page_start: page index of the first page to be flushed |
| 275 | * @page_end: page index of the last page to be flushed + 1 |
| 276 | * |
| 277 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush |
| 278 | * cache. |
| 279 | * |
| 280 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once |
| 281 | * for the whole region. |
| 282 | */ |
| 283 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, |
| 284 | int page_start, int page_end) |
| 285 | { |
| 286 | flush_cache_vmap( |
| 287 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), |
| 288 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); |
| 289 | } |
| 290 | |
| 291 | /** |
| 292 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk |
| 293 | * @chunk: chunk of interest |
| 294 | * @off: offset to the area to populate |
| 295 | * @size: size of the area to populate in bytes |
| 296 | * |
| 297 | * For each cpu, populate and map pages [@page_start,@page_end) into |
| 298 | * @chunk. The area is cleared on return. |
| 299 | * |
| 300 | * CONTEXT: |
| 301 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. |
| 302 | */ |
| 303 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 304 | { |
| 305 | int page_start = PFN_DOWN(off); |
| 306 | int page_end = PFN_UP(off + size); |
| 307 | int free_end = page_start, unmap_end = page_start; |
| 308 | struct page **pages; |
| 309 | unsigned long *populated; |
| 310 | unsigned int cpu; |
| 311 | int rs, re, rc; |
| 312 | |
| 313 | /* quick path, check whether all pages are already there */ |
| 314 | rs = page_start; |
| 315 | pcpu_next_pop(chunk, &rs, &re, page_end); |
| 316 | if (rs == page_start && re == page_end) |
| 317 | goto clear; |
| 318 | |
| 319 | /* need to allocate and map pages, this chunk can't be immutable */ |
| 320 | WARN_ON(chunk->immutable); |
| 321 | |
| 322 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); |
| 323 | if (!pages) |
| 324 | return -ENOMEM; |
| 325 | |
| 326 | /* alloc and map */ |
| 327 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 328 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); |
| 329 | if (rc) |
| 330 | goto err_free; |
| 331 | free_end = re; |
| 332 | } |
| 333 | |
| 334 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 335 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); |
| 336 | if (rc) |
| 337 | goto err_unmap; |
| 338 | unmap_end = re; |
| 339 | } |
| 340 | pcpu_post_map_flush(chunk, page_start, page_end); |
| 341 | |
| 342 | /* commit new bitmap */ |
| 343 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
| 344 | clear: |
| 345 | for_each_possible_cpu(cpu) |
| 346 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); |
| 347 | return 0; |
| 348 | |
| 349 | err_unmap: |
| 350 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); |
| 351 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) |
| 352 | pcpu_unmap_pages(chunk, pages, populated, rs, re); |
| 353 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); |
| 354 | err_free: |
| 355 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) |
| 356 | pcpu_free_pages(chunk, pages, populated, rs, re); |
| 357 | return rc; |
| 358 | } |
| 359 | |
| 360 | /** |
| 361 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk |
| 362 | * @chunk: chunk to depopulate |
| 363 | * @off: offset to the area to depopulate |
| 364 | * @size: size of the area to depopulate in bytes |
| 365 | * @flush: whether to flush cache and tlb or not |
| 366 | * |
| 367 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
| 368 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
| 369 | * and tlb after. |
| 370 | * |
| 371 | * CONTEXT: |
| 372 | * pcpu_alloc_mutex. |
| 373 | */ |
| 374 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 375 | { |
| 376 | int page_start = PFN_DOWN(off); |
| 377 | int page_end = PFN_UP(off + size); |
| 378 | struct page **pages; |
| 379 | unsigned long *populated; |
| 380 | int rs, re; |
| 381 | |
| 382 | /* quick path, check whether it's empty already */ |
| 383 | rs = page_start; |
| 384 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
| 385 | if (rs == page_start && re == page_end) |
| 386 | return; |
| 387 | |
| 388 | /* immutable chunks can't be depopulated */ |
| 389 | WARN_ON(chunk->immutable); |
| 390 | |
| 391 | /* |
| 392 | * If control reaches here, there must have been at least one |
| 393 | * successful population attempt so the temp pages array must |
| 394 | * be available now. |
| 395 | */ |
| 396 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); |
| 397 | BUG_ON(!pages); |
| 398 | |
| 399 | /* unmap and free */ |
| 400 | pcpu_pre_unmap_flush(chunk, page_start, page_end); |
| 401 | |
| 402 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) |
| 403 | pcpu_unmap_pages(chunk, pages, populated, rs, re); |
| 404 | |
| 405 | /* no need to flush tlb, vmalloc will handle it lazily */ |
| 406 | |
| 407 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) |
| 408 | pcpu_free_pages(chunk, pages, populated, rs, re); |
| 409 | |
| 410 | /* commit new bitmap */ |
| 411 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
| 412 | } |
| 413 | |
| 414 | static struct pcpu_chunk *pcpu_create_chunk(void) |
| 415 | { |
| 416 | struct pcpu_chunk *chunk; |
| 417 | struct vm_struct **vms; |
| 418 | |
| 419 | chunk = pcpu_alloc_chunk(); |
| 420 | if (!chunk) |
| 421 | return NULL; |
| 422 | |
| 423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
David Rientjes | ec3f64f | 2011-01-13 15:46:01 -0800 | [diff] [blame] | 424 | pcpu_nr_groups, pcpu_atom_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 425 | if (!vms) { |
| 426 | pcpu_free_chunk(chunk); |
| 427 | return NULL; |
| 428 | } |
| 429 | |
| 430 | chunk->data = vms; |
| 431 | chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; |
| 432 | return chunk; |
| 433 | } |
| 434 | |
| 435 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) |
| 436 | { |
| 437 | if (chunk && chunk->data) |
| 438 | pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); |
| 439 | pcpu_free_chunk(chunk); |
| 440 | } |
| 441 | |
/* Translate @addr to its struct page via vmalloc_to_page(). */
static struct page *pcpu_addr_to_page(void *addr)
{
	struct page *pg = vmalloc_to_page(addr);

	return pg;
}

| 447 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) |
| 448 | { |
| 449 | /* no extra restriction */ |
| 450 | return 0; |
| 451 | } |