blob: 119d18611500f4ccde99916b1012bfba0b9f1039 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
Becky Bruce41151e72011-06-28 09:54:48 +00002 * PPC Huge TLB Page Support for Kernel.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003 *
4 * Copyright (C) 2003 David Gibson, IBM Corporation.
Becky Bruce41151e72011-06-28 09:54:48 +00005 * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
Linus Torvalds1da177e2005-04-16 15:20:36 -07006 *
7 * Based on the IA-32 version:
8 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
9 */
10
Linus Torvalds1da177e2005-04-16 15:20:36 -070011#include <linux/mm.h>
David Gibson883a3e52009-10-26 19:24:31 +000012#include <linux/io.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090013#include <linux/slab.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070014#include <linux/hugetlb.h>
Paul Mackerras342d3db2011-12-12 12:38:05 +000015#include <linux/export.h>
Becky Bruce41151e72011-06-28 09:54:48 +000016#include <linux/of_fdt.h>
17#include <linux/memblock.h>
18#include <linux/bootmem.h>
Kumar Gala13020be2011-11-24 09:40:07 +000019#include <linux/moduleparam.h>
David Gibson883a3e52009-10-26 19:24:31 +000020#include <asm/pgtable.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070021#include <asm/pgalloc.h>
22#include <asm/tlb.h>
Becky Bruce41151e72011-06-28 09:54:48 +000023#include <asm/setup.h>
Aneesh Kumar K.V29409992013-06-20 14:30:16 +053024#include <asm/hugetlb.h>
25
26#ifdef CONFIG_HUGETLB_PAGE
Linus Torvalds1da177e2005-04-16 15:20:36 -070027
/* log2 of the 64K, 16M and 16G page sizes. */
#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

/* log2 of the default huge page size; assigned by hugetlbpage_init(). */
unsigned int HPAGE_SHIFT;
33
34/*
35 * Tracks gpages after the device tree is scanned and before the
Becky Brucea6146882011-10-10 10:50:43 +000036 * huge_boot_pages list is ready. On non-Freescale implementations, this is
37 * just used to track 16G pages and so is a single array. FSL-based
38 * implementations may have more than one gpage size, so we need multiple
39 * arrays
Becky Bruce41151e72011-06-28 09:54:48 +000040 */
Becky Bruce881fde12011-10-10 10:50:40 +000041#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +000042#define MAX_NUMBER_GPAGES 128
43struct psize_gpages {
44 u64 gpage_list[MAX_NUMBER_GPAGES];
45 unsigned int nr_gpages;
46};
47static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
Becky Bruce881fde12011-10-10 10:50:40 +000048#else
49#define MAX_NUMBER_GPAGES 1024
50static u64 gpage_freearray[MAX_NUMBER_GPAGES];
51static unsigned nr_gpages;
Becky Bruce41151e72011-06-28 09:54:48 +000052#endif
David Gibsonf10a04c2006-04-28 15:02:51 +100053
/* True when the hugepd entry is empty, i.e. its pd word is zero. */
#define hugepd_none(hpd)	((hpd).pd == 0)
55
David Gibsona4fe3ce2009-10-26 19:24:31 +000056pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
57{
Aneesh Kumar K.V12bc9f62013-06-20 14:30:18 +053058 /* Only called for hugetlbfs pages, hence can ignore THP */
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +053059 return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
David Gibsona4fe3ce2009-10-26 19:24:31 +000060}
61
62static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
63 unsigned long address, unsigned pdshift, unsigned pshift)
64{
Becky Bruce41151e72011-06-28 09:54:48 +000065 struct kmem_cache *cachep;
66 pte_t *new;
67
Becky Bruce881fde12011-10-10 10:50:40 +000068#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +000069 int i;
70 int num_hugepd = 1 << (pshift - pdshift);
71 cachep = hugepte_cache;
Becky Bruce881fde12011-10-10 10:50:40 +000072#else
73 cachep = PGT_CACHE(pdshift - pshift);
Becky Bruce41151e72011-06-28 09:54:48 +000074#endif
75
Michal Hocko2379a232016-06-24 14:49:12 -070076 new = kmem_cache_zalloc(cachep, GFP_KERNEL);
David Gibsonf10a04c2006-04-28 15:02:51 +100077
David Gibsona4fe3ce2009-10-26 19:24:31 +000078 BUG_ON(pshift > HUGEPD_SHIFT_MASK);
79 BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
80
David Gibsonf10a04c2006-04-28 15:02:51 +100081 if (! new)
82 return -ENOMEM;
83
84 spin_lock(&mm->page_table_lock);
Becky Bruce881fde12011-10-10 10:50:40 +000085#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +000086 /*
87 * We have multiple higher-level entries that point to the same
88 * actual pte location. Fill in each as we go and backtrack on error.
89 * We need all of these so the DTLB pgtable walk code can find the
90 * right higher-level entry without knowing if it's a hugepage or not.
91 */
92 for (i = 0; i < num_hugepd; i++, hpdp++) {
93 if (unlikely(!hugepd_none(*hpdp)))
94 break;
95 else
Aneesh Kumar K.Vcf9427b2013-04-28 09:37:29 +000096 /* We use the old format for PPC_FSL_BOOK3E */
Becky Bruce41151e72011-06-28 09:54:48 +000097 hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
98 }
99 /* If we bailed from the for loop early, an error occurred, clean up */
100 if (i < num_hugepd) {
101 for (i = i - 1 ; i >= 0; i--, hpdp--)
102 hpdp->pd = 0;
103 kmem_cache_free(cachep, new);
104 }
Becky Brucea1cd5412011-10-10 10:50:39 +0000105#else
106 if (!hugepd_none(*hpdp))
107 kmem_cache_free(cachep, new);
Aneesh Kumar K.Vcf9427b2013-04-28 09:37:29 +0000108 else {
109#ifdef CONFIG_PPC_BOOK3S_64
Paul Mackerrasc61a8842016-02-23 13:36:17 +1100110 hpdp->pd = __pa(new) | (shift_to_mmu_psize(pshift) << 2);
Aneesh Kumar K.Vcf9427b2013-04-28 09:37:29 +0000111#else
Becky Brucea1cd5412011-10-10 10:50:39 +0000112 hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
Becky Bruce41151e72011-06-28 09:54:48 +0000113#endif
Aneesh Kumar K.Vcf9427b2013-04-28 09:37:29 +0000114 }
115#endif
David Gibsonf10a04c2006-04-28 15:02:51 +1000116 spin_unlock(&mm->page_table_lock);
117 return 0;
118}
119
/*
 * Thresholds used by huge_pte_alloc() to pick the page table level that
 * holds the hpdp: a page shift of at least HUGEPD_PGD_SHIFT puts it at the
 * pgd level, at least HUGEPD_PUD_SHIFT at the pud level.
 */
#ifdef CONFIG_PPC_FSL_BOOK3E
#define HUGEPD_PGD_SHIFT	PGDIR_SHIFT
#define HUGEPD_PUD_SHIFT	PUD_SHIFT
#else
#define HUGEPD_PGD_SHIFT	PUD_SHIFT
#define HUGEPD_PUD_SHIFT	PMD_SHIFT
#endif
131
Aneesh Kumar K.Ve2b3d202013-04-28 09:37:30 +0000132#ifdef CONFIG_PPC_BOOK3S_64
133/*
134 * At this point we do the placement change only for BOOK3S 64. This would
135 * possibly work on other subarchs.
136 */
137pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
138{
139 pgd_t *pg;
140 pud_t *pu;
141 pmd_t *pm;
142 hugepd_t *hpdp = NULL;
143 unsigned pshift = __ffs(sz);
144 unsigned pdshift = PGDIR_SHIFT;
145
146 addr &= ~(sz-1);
147 pg = pgd_offset(mm, addr);
148
149 if (pshift == PGDIR_SHIFT)
150 /* 16GB huge page */
151 return (pte_t *) pg;
152 else if (pshift > PUD_SHIFT)
153 /*
154 * We need to use hugepd table
155 */
156 hpdp = (hugepd_t *)pg;
157 else {
158 pdshift = PUD_SHIFT;
159 pu = pud_alloc(mm, pg, addr);
160 if (pshift == PUD_SHIFT)
161 return (pte_t *)pu;
162 else if (pshift > PMD_SHIFT)
163 hpdp = (hugepd_t *)pu;
164 else {
165 pdshift = PMD_SHIFT;
166 pm = pmd_alloc(mm, pu, addr);
167 if (pshift == PMD_SHIFT)
168 /* 16MB hugepage */
169 return (pte_t *)pm;
170 else
171 hpdp = (hugepd_t *)pm;
172 }
173 }
174 if (!hpdp)
175 return NULL;
176
177 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
178
179 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
180 return NULL;
181
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530182 return hugepte_offset(*hpdp, addr, pdshift);
Aneesh Kumar K.Ve2b3d202013-04-28 09:37:30 +0000183}
184
185#else
186
David Gibsona4fe3ce2009-10-26 19:24:31 +0000187pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
188{
189 pgd_t *pg;
190 pud_t *pu;
191 pmd_t *pm;
192 hugepd_t *hpdp = NULL;
193 unsigned pshift = __ffs(sz);
194 unsigned pdshift = PGDIR_SHIFT;
David Gibson0b264252008-09-05 11:49:54 +1000195
David Gibsona4fe3ce2009-10-26 19:24:31 +0000196 addr &= ~(sz-1);
197
198 pg = pgd_offset(mm, addr);
Becky Brucea1cd5412011-10-10 10:50:39 +0000199
200 if (pshift >= HUGEPD_PGD_SHIFT) {
David Gibsona4fe3ce2009-10-26 19:24:31 +0000201 hpdp = (hugepd_t *)pg;
202 } else {
203 pdshift = PUD_SHIFT;
204 pu = pud_alloc(mm, pg, addr);
Becky Brucea1cd5412011-10-10 10:50:39 +0000205 if (pshift >= HUGEPD_PUD_SHIFT) {
David Gibsona4fe3ce2009-10-26 19:24:31 +0000206 hpdp = (hugepd_t *)pu;
207 } else {
208 pdshift = PMD_SHIFT;
209 pm = pmd_alloc(mm, pu, addr);
210 hpdp = (hugepd_t *)pm;
211 }
212 }
213
214 if (!hpdp)
215 return NULL;
216
217 BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
218
219 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
220 return NULL;
221
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530222 return hugepte_offset(*hpdp, addr, pdshift);
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100223}
Aneesh Kumar K.Ve2b3d202013-04-28 09:37:30 +0000224#endif
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100225
Becky Bruce881fde12011-10-10 10:50:40 +0000226#ifdef CONFIG_PPC_FSL_BOOK3E
Jon Tollefson658013e2008-07-23 21:27:54 -0700227/* Build list of addresses of gigantic pages. This function is used in early
Anton Blanchard14ed7402014-09-17 22:15:34 +1000228 * boot before the buddy allocator is setup.
Jon Tollefson658013e2008-07-23 21:27:54 -0700229 */
Becky Bruce41151e72011-06-28 09:54:48 +0000230void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
231{
232 unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
233 int i;
234
235 if (addr == 0)
236 return;
237
238 gpage_freearray[idx].nr_gpages = number_of_pages;
239
240 for (i = 0; i < number_of_pages; i++) {
241 gpage_freearray[idx].gpage_list[i] = addr;
242 addr += page_size;
243 }
244}
245
246/*
247 * Moves the gigantic page addresses from the temporary list to the
248 * huge_boot_pages list.
249 */
250int alloc_bootmem_huge_page(struct hstate *hstate)
251{
252 struct huge_bootmem_page *m;
Wanpeng Li2415cf12013-07-03 15:02:43 -0700253 int idx = shift_to_mmu_psize(huge_page_shift(hstate));
Becky Bruce41151e72011-06-28 09:54:48 +0000254 int nr_gpages = gpage_freearray[idx].nr_gpages;
255
256 if (nr_gpages == 0)
257 return 0;
258
259#ifdef CONFIG_HIGHMEM
260 /*
261 * If gpages can be in highmem we can't use the trick of storing the
262 * data structure in the page; allocate space for this
263 */
Michael Ellermane39f223f2014-11-18 16:47:35 +1100264 m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0);
Becky Bruce41151e72011-06-28 09:54:48 +0000265 m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
266#else
267 m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
268#endif
269
270 list_add(&m->list, &huge_boot_pages);
271 gpage_freearray[idx].nr_gpages = nr_gpages;
272 gpage_freearray[idx].gpage_list[nr_gpages] = 0;
273 m->hstate = hstate;
274
275 return 1;
276}
277/*
278 * Scan the command line hugepagesz= options for gigantic pages; store those in
279 * a list that we use to allocate the memory once all options are parsed.
280 */
281
282unsigned long gpage_npages[MMU_PAGE_COUNT];
283
Paul Gortmaker89528122012-05-07 10:32:22 -0400284static int __init do_gpage_early_setup(char *param, char *val,
Luis R. Rodriguezecc86172015-03-30 16:20:03 -0700285 const char *unused, void *arg)
Becky Bruce41151e72011-06-28 09:54:48 +0000286{
287 static phys_addr_t size;
288 unsigned long npages;
289
290 /*
291 * The hugepagesz and hugepages cmdline options are interleaved. We
292 * use the size variable to keep track of whether or not this was done
293 * properly and skip over instances where it is incorrect. Other
294 * command-line parsing code will issue warnings, so we don't need to.
295 *
296 */
297 if ((strcmp(param, "default_hugepagesz") == 0) ||
298 (strcmp(param, "hugepagesz") == 0)) {
299 size = memparse(val, NULL);
300 } else if (strcmp(param, "hugepages") == 0) {
301 if (size != 0) {
302 if (sscanf(val, "%lu", &npages) <= 0)
303 npages = 0;
James Yangc4f3eb52014-11-14 12:32:24 -0600304 if (npages > MAX_NUMBER_GPAGES) {
305 pr_warn("MMU: %lu pages requested for page "
306 "size %llu KB, limiting to "
307 __stringify(MAX_NUMBER_GPAGES) "\n",
308 npages, size / 1024);
309 npages = MAX_NUMBER_GPAGES;
310 }
Becky Bruce41151e72011-06-28 09:54:48 +0000311 gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
312 size = 0;
313 }
314 }
315 return 0;
316}
317
318
319/*
320 * This function allocates physical space for pages that are larger than the
321 * buddy allocator can handle. We want to allocate these in highmem because
322 * the amount of lowmem is limited. This means that this function MUST be
323 * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
324 * allocate to grab highmem.
325 */
326void __init reserve_hugetlb_gpages(void)
327{
328 static __initdata char cmdline[COMMAND_LINE_SIZE];
329 phys_addr_t size, base;
330 int i;
331
332 strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
Pawel Moll026cee02012-03-26 12:50:51 +1030333 parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
Luis R. Rodriguezecc86172015-03-30 16:20:03 -0700334 NULL, &do_gpage_early_setup);
Becky Bruce41151e72011-06-28 09:54:48 +0000335
336 /*
337 * Walk gpage list in reverse, allocating larger page sizes first.
338 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
339 * When we reach the point in the list where pages are no longer
340 * considered gpages, we're done.
341 */
342 for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
343 if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
344 continue;
345 else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
346 break;
347
348 size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
349 base = memblock_alloc_base(size * gpage_npages[i], size,
350 MEMBLOCK_ALLOC_ANYWHERE);
351 add_gpage(base, size, gpage_npages[i]);
352 }
353}
354
Becky Bruce881fde12011-10-10 10:50:40 +0000355#else /* !PPC_FSL_BOOK3E */
Becky Bruce41151e72011-06-28 09:54:48 +0000356
357/* Build list of addresses of gigantic pages. This function is used in early
Anton Blanchard14ed7402014-09-17 22:15:34 +1000358 * boot before the buddy allocator is setup.
Becky Bruce41151e72011-06-28 09:54:48 +0000359 */
360void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
Jon Tollefson658013e2008-07-23 21:27:54 -0700361{
362 if (!addr)
363 return;
364 while (number_of_pages > 0) {
365 gpage_freearray[nr_gpages] = addr;
366 nr_gpages++;
367 number_of_pages--;
368 addr += page_size;
369 }
370}
371
Jon Tollefsonec4b2c02008-07-23 21:27:53 -0700372/* Moves the gigantic page addresses from the temporary list to the
Jon Tollefson0d9ea752008-07-23 21:27:56 -0700373 * huge_boot_pages list.
374 */
375int alloc_bootmem_huge_page(struct hstate *hstate)
Jon Tollefsonec4b2c02008-07-23 21:27:53 -0700376{
377 struct huge_bootmem_page *m;
378 if (nr_gpages == 0)
379 return 0;
380 m = phys_to_virt(gpage_freearray[--nr_gpages]);
381 gpage_freearray[nr_gpages] = 0;
382 list_add(&m->list, &huge_boot_pages);
Jon Tollefson0d9ea752008-07-23 21:27:56 -0700383 m->hstate = hstate;
Jon Tollefsonec4b2c02008-07-23 21:27:53 -0700384 return 1;
385}
Becky Bruce41151e72011-06-28 09:54:48 +0000386#endif
Jon Tollefsonec4b2c02008-07-23 21:27:53 -0700387
Becky Bruce881fde12011-10-10 10:50:40 +0000388#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +0000389#define HUGEPD_FREELIST_SIZE \
390 ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
391
392struct hugepd_freelist {
393 struct rcu_head rcu;
394 unsigned int index;
395 void *ptes[0];
396};
397
398static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
399
400static void hugepd_free_rcu_callback(struct rcu_head *head)
401{
402 struct hugepd_freelist *batch =
403 container_of(head, struct hugepd_freelist, rcu);
404 unsigned int i;
405
406 for (i = 0; i < batch->index; i++)
407 kmem_cache_free(hugepte_cache, batch->ptes[i]);
408
409 free_page((unsigned long)batch);
410}
411
412static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
413{
414 struct hugepd_freelist **batchp;
415
Sebastian Siewior08a5bb22016-03-08 10:03:56 +0100416 batchp = &get_cpu_var(hugepd_freelist_cur);
Becky Bruce41151e72011-06-28 09:54:48 +0000417
418 if (atomic_read(&tlb->mm->mm_users) < 2 ||
419 cpumask_equal(mm_cpumask(tlb->mm),
420 cpumask_of(smp_processor_id()))) {
421 kmem_cache_free(hugepte_cache, hugepte);
Sebastian Siewior08a5bb22016-03-08 10:03:56 +0100422 put_cpu_var(hugepd_freelist_cur);
Becky Bruce41151e72011-06-28 09:54:48 +0000423 return;
424 }
425
426 if (*batchp == NULL) {
427 *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
428 (*batchp)->index = 0;
429 }
430
431 (*batchp)->ptes[(*batchp)->index++] = hugepte;
432 if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
433 call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
434 *batchp = NULL;
435 }
Tiejun Chen94b09d72014-01-20 16:39:34 +0800436 put_cpu_var(hugepd_freelist_cur);
Becky Bruce41151e72011-06-28 09:54:48 +0000437}
438#endif
439
David Gibsona4fe3ce2009-10-26 19:24:31 +0000440static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
441 unsigned long start, unsigned long end,
442 unsigned long floor, unsigned long ceiling)
David Gibsonf10a04c2006-04-28 15:02:51 +1000443{
444 pte_t *hugepte = hugepd_page(*hpdp);
Becky Bruce41151e72011-06-28 09:54:48 +0000445 int i;
446
David Gibsona4fe3ce2009-10-26 19:24:31 +0000447 unsigned long pdmask = ~((1UL << pdshift) - 1);
Becky Bruce41151e72011-06-28 09:54:48 +0000448 unsigned int num_hugepd = 1;
449
Becky Bruce881fde12011-10-10 10:50:40 +0000450#ifdef CONFIG_PPC_FSL_BOOK3E
451 /* Note: On fsl the hpdp may be the first of several */
Becky Bruce41151e72011-06-28 09:54:48 +0000452 num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
Becky Bruce881fde12011-10-10 10:50:40 +0000453#else
454 unsigned int shift = hugepd_shift(*hpdp);
Becky Bruce41151e72011-06-28 09:54:48 +0000455#endif
David Gibsona4fe3ce2009-10-26 19:24:31 +0000456
457 start &= pdmask;
458 if (start < floor)
459 return;
460 if (ceiling) {
461 ceiling &= pdmask;
462 if (! ceiling)
463 return;
464 }
465 if (end - 1 > ceiling - 1)
466 return;
David Gibsonf10a04c2006-04-28 15:02:51 +1000467
Becky Bruce41151e72011-06-28 09:54:48 +0000468 for (i = 0; i < num_hugepd; i++, hpdp++)
469 hpdp->pd = 0;
470
Becky Bruce881fde12011-10-10 10:50:40 +0000471#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +0000472 hugepd_free(tlb, hugepte);
Becky Bruce881fde12011-10-10 10:50:40 +0000473#else
474 pgtable_free_tlb(tlb, hugepte, pdshift - shift);
Becky Bruce41151e72011-06-28 09:54:48 +0000475#endif
David Gibsonf10a04c2006-04-28 15:02:51 +1000476}
477
David Gibsonf10a04c2006-04-28 15:02:51 +1000478static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
479 unsigned long addr, unsigned long end,
David Gibsona4fe3ce2009-10-26 19:24:31 +0000480 unsigned long floor, unsigned long ceiling)
David Gibsonf10a04c2006-04-28 15:02:51 +1000481{
482 pmd_t *pmd;
483 unsigned long next;
484 unsigned long start;
485
486 start = addr;
David Gibsonf10a04c2006-04-28 15:02:51 +1000487 do {
Becky Brucea1cd5412011-10-10 10:50:39 +0000488 pmd = pmd_offset(pud, addr);
David Gibsonf10a04c2006-04-28 15:02:51 +1000489 next = pmd_addr_end(addr, end);
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530490 if (!is_hugepd(__hugepd(pmd_val(*pmd)))) {
Aneesh Kumar K.V8bbd9f02013-06-19 12:04:26 +0530491 /*
492 * if it is not hugepd pointer, we should already find
493 * it cleared.
494 */
495 WARN_ON(!pmd_none_or_clear_bad(pmd));
David Gibsonf10a04c2006-04-28 15:02:51 +1000496 continue;
Aneesh Kumar K.V8bbd9f02013-06-19 12:04:26 +0530497 }
Becky Brucea1cd5412011-10-10 10:50:39 +0000498#ifdef CONFIG_PPC_FSL_BOOK3E
499 /*
500 * Increment next by the size of the huge mapping since
501 * there may be more than one entry at this level for a
502 * single hugepage, but all of them point to
503 * the same kmem cache that holds the hugepte.
504 */
505 next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd));
506#endif
David Gibsona4fe3ce2009-10-26 19:24:31 +0000507 free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
508 addr, next, floor, ceiling);
Becky Brucea1cd5412011-10-10 10:50:39 +0000509 } while (addr = next, addr != end);
David Gibsonf10a04c2006-04-28 15:02:51 +1000510
511 start &= PUD_MASK;
512 if (start < floor)
513 return;
514 if (ceiling) {
515 ceiling &= PUD_MASK;
516 if (!ceiling)
517 return;
518 }
519 if (end - 1 > ceiling - 1)
520 return;
521
522 pmd = pmd_offset(pud, start);
523 pud_clear(pud);
Benjamin Herrenschmidt9e1b32c2009-07-22 15:44:28 +1000524 pmd_free_tlb(tlb, pmd, start);
Scott Wood50c6a662015-04-10 19:37:34 -0500525 mm_dec_nr_pmds(tlb->mm);
David Gibsonf10a04c2006-04-28 15:02:51 +1000526}
David Gibsonf10a04c2006-04-28 15:02:51 +1000527
528static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
529 unsigned long addr, unsigned long end,
530 unsigned long floor, unsigned long ceiling)
531{
532 pud_t *pud;
533 unsigned long next;
534 unsigned long start;
535
536 start = addr;
David Gibsonf10a04c2006-04-28 15:02:51 +1000537 do {
Becky Brucea1cd5412011-10-10 10:50:39 +0000538 pud = pud_offset(pgd, addr);
David Gibsonf10a04c2006-04-28 15:02:51 +1000539 next = pud_addr_end(addr, end);
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530540 if (!is_hugepd(__hugepd(pud_val(*pud)))) {
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100541 if (pud_none_or_clear_bad(pud))
542 continue;
Jon Tollefson0d9ea752008-07-23 21:27:56 -0700543 hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
David Gibsona4fe3ce2009-10-26 19:24:31 +0000544 ceiling);
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100545 } else {
Becky Brucea1cd5412011-10-10 10:50:39 +0000546#ifdef CONFIG_PPC_FSL_BOOK3E
547 /*
548 * Increment next by the size of the huge mapping since
549 * there may be more than one entry at this level for a
550 * single hugepage, but all of them point to
551 * the same kmem cache that holds the hugepte.
552 */
553 next = addr + (1 << hugepd_shift(*(hugepd_t *)pud));
554#endif
David Gibsona4fe3ce2009-10-26 19:24:31 +0000555 free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
556 addr, next, floor, ceiling);
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100557 }
Becky Brucea1cd5412011-10-10 10:50:39 +0000558 } while (addr = next, addr != end);
David Gibsonf10a04c2006-04-28 15:02:51 +1000559
560 start &= PGDIR_MASK;
561 if (start < floor)
562 return;
563 if (ceiling) {
564 ceiling &= PGDIR_MASK;
565 if (!ceiling)
566 return;
567 }
568 if (end - 1 > ceiling - 1)
569 return;
570
571 pud = pud_offset(pgd, start);
572 pgd_clear(pgd);
Benjamin Herrenschmidt9e1b32c2009-07-22 15:44:28 +1000573 pud_free_tlb(tlb, pud, start);
David Gibsonf10a04c2006-04-28 15:02:51 +1000574}
575
576/*
577 * This function frees user-level page tables of a process.
David Gibsonf10a04c2006-04-28 15:02:51 +1000578 */
Jan Beulich42b77722008-07-23 21:27:10 -0700579void hugetlb_free_pgd_range(struct mmu_gather *tlb,
David Gibsonf10a04c2006-04-28 15:02:51 +1000580 unsigned long addr, unsigned long end,
581 unsigned long floor, unsigned long ceiling)
582{
583 pgd_t *pgd;
584 unsigned long next;
David Gibsonf10a04c2006-04-28 15:02:51 +1000585
586 /*
David Gibsona4fe3ce2009-10-26 19:24:31 +0000587 * Because there are a number of different possible pagetable
588 * layouts for hugepage ranges, we limit knowledge of how
589 * things should be laid out to the allocation path
590 * (huge_pte_alloc(), above). Everything else works out the
591 * structure as it goes from information in the hugepd
592 * pointers. That means that we can't here use the
593 * optimization used in the normal page free_pgd_range(), of
594 * checking whether we're actually covering a large enough
595 * range to have to do anything at the top level of the walk
596 * instead of at the bottom.
David Gibsonf10a04c2006-04-28 15:02:51 +1000597 *
David Gibsona4fe3ce2009-10-26 19:24:31 +0000598 * To make sense of this, you should probably go read the big
599 * block comment at the top of the normal free_pgd_range(),
600 * too.
David Gibsonf10a04c2006-04-28 15:02:51 +1000601 */
602
David Gibsonf10a04c2006-04-28 15:02:51 +1000603 do {
David Gibsonf10a04c2006-04-28 15:02:51 +1000604 next = pgd_addr_end(addr, end);
Becky Bruce41151e72011-06-28 09:54:48 +0000605 pgd = pgd_offset(tlb->mm, addr);
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530606 if (!is_hugepd(__hugepd(pgd_val(*pgd)))) {
David Gibson0b264252008-09-05 11:49:54 +1000607 if (pgd_none_or_clear_bad(pgd))
608 continue;
609 hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
610 } else {
Becky Bruce881fde12011-10-10 10:50:40 +0000611#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +0000612 /*
613 * Increment next by the size of the huge mapping since
Becky Bruce881fde12011-10-10 10:50:40 +0000614 * there may be more than one entry at the pgd level
615 * for a single hugepage, but all of them point to the
616 * same kmem cache that holds the hugepte.
Becky Bruce41151e72011-06-28 09:54:48 +0000617 */
618 next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
619#endif
David Gibsona4fe3ce2009-10-26 19:24:31 +0000620 free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
621 addr, next, floor, ceiling);
David Gibson0b264252008-09-05 11:49:54 +1000622 }
Becky Bruce41151e72011-06-28 09:54:48 +0000623 } while (addr = next, addr != end);
David Gibsone28f7fa2005-08-05 19:39:06 +1000624}
625
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530626/*
627 * We are holding mmap_sem, so a parallel huge page collapse cannot run.
628 * To prevent hugepage split, disable irq.
629 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630struct page *
631follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
632{
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +0530633 bool is_thp;
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530634 pte_t *ptep, pte;
David Gibsona4fe3ce2009-10-26 19:24:31 +0000635 unsigned shift;
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530636 unsigned long mask, flags;
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530637 struct page *page = ERR_PTR(-EINVAL);
638
639 local_irq_save(flags);
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +0530640 ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift);
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530641 if (!ptep)
642 goto no_page;
643 pte = READ_ONCE(*ptep);
Aneesh Kumar K.V12bc9f62013-06-20 14:30:18 +0530644 /*
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530645 * Verify it is a huge page else bail.
Aneesh Kumar K.V12bc9f62013-06-20 14:30:18 +0530646 * Transparent hugepages are handled by generic code. We can skip them
647 * here.
648 */
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +0530649 if (!shift || is_thp)
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530650 goto no_page;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700651
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530652 if (!pte_present(pte)) {
653 page = NULL;
654 goto no_page;
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530655 }
David Gibsona4fe3ce2009-10-26 19:24:31 +0000656 mask = (1UL << shift) - 1;
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530657 page = pte_page(pte);
David Gibsona4fe3ce2009-10-26 19:24:31 +0000658 if (page)
659 page += (address & mask) / PAGE_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700660
Aneesh Kumar K.V7b868e82015-05-11 11:58:29 +0530661no_page:
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530662 local_irq_restore(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700663 return page;
664}
665
Linus Torvalds1da177e2005-04-16 15:20:36 -0700666struct page *
667follow_huge_pmd(struct mm_struct *mm, unsigned long address,
668 pmd_t *pmd, int write)
669{
670 BUG();
671 return NULL;
672}
673
Naoya Horiguchi61f77ed2015-02-11 15:25:15 -0800674struct page *
675follow_huge_pud(struct mm_struct *mm, unsigned long address,
676 pud_t *pud, int write)
677{
678 BUG();
679 return NULL;
680}
681
/*
 * Return the next @sz-aligned boundary above @addr, or @end if that comes
 * first.  Both sides of the comparison are reduced by one so that an @end of
 * 0 compares as the highest possible address.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long boundary = (addr + sz) & ~(sz - 1);

	if (boundary - 1 < end - 1)
		return boundary;

	return end;
}
688
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530689int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift,
690 unsigned long end, int write, struct page **pages, int *nr)
David Gibsona4fe3ce2009-10-26 19:24:31 +0000691{
692 pte_t *ptep;
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530693 unsigned long sz = 1UL << hugepd_shift(hugepd);
David Gibson39adfa52009-11-23 20:03:40 +0000694 unsigned long next;
David Gibsona4fe3ce2009-10-26 19:24:31 +0000695
696 ptep = hugepte_offset(hugepd, addr, pdshift);
697 do {
David Gibson39adfa52009-11-23 20:03:40 +0000698 next = hugepte_addr_end(addr, end, sz);
David Gibsona4fe3ce2009-10-26 19:24:31 +0000699 if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
700 return 0;
David Gibson39adfa52009-11-23 20:03:40 +0000701 } while (ptep++, addr = next, addr != end);
David Gibsona4fe3ce2009-10-26 19:24:31 +0000702
703 return 1;
704}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700705
Becky Bruce76512952011-10-10 10:50:36 +0000706#ifdef CONFIG_PPC_MM_SLICES
/*
 * Pick an unmapped area for a hugetlbfs mapping of @file: delegated to the
 * radix implementation when radix is enabled, otherwise to the slice
 * allocator using the MMU page size index of the file's huge page size.
 */
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	int psize = shift_to_mmu_psize(huge_page_shift(h));

	if (!radix_enabled())
		return slice_get_unmapped_area(addr, len, flags, psize, 1);

	return radix__hugetlb_get_unmapped_area(file, addr, len,
						pgoff, flags);
}
Becky Bruce76512952011-10-10 10:50:36 +0000719#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700720
Mel Gorman33402892009-01-06 14:38:54 -0800721unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
722{
Paul Mackerras25c29f92011-09-20 19:58:10 +0000723#ifdef CONFIG_PPC_MM_SLICES
Mel Gorman33402892009-01-06 14:38:54 -0800724 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
Aneesh Kumar K.V2f5f0df2016-04-29 23:26:24 +1000725 /* With radix we don't use slice, so derive it from vma*/
726 if (!radix_enabled())
727 return 1UL << mmu_psize_to_shift(psize);
728#endif
Becky Bruce41151e72011-06-28 09:54:48 +0000729 if (!is_vm_hugetlb_page(vma))
730 return PAGE_SIZE;
731
732 return huge_page_size(hstate_vma(vma));
Becky Bruce41151e72011-06-28 09:54:48 +0000733}
734
735static inline bool is_power_of_4(unsigned long x)
736{
737 if (is_power_of_2(x))
738 return (__ilog2(x) % 2) ? false : true;
739 return false;
Mel Gorman33402892009-01-06 14:38:54 -0800740}
741
David Gibsond1837cb2009-10-26 19:24:31 +0000742static int __init add_huge_page_size(unsigned long long size)
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100743{
David Gibsond1837cb2009-10-26 19:24:31 +0000744 int shift = __ffs(size);
745 int mmu_psize;
David Gibsona4fe3ce2009-10-26 19:24:31 +0000746
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100747 /* Check that it is a page size supported by the hardware and
David Gibsond1837cb2009-10-26 19:24:31 +0000748 * that it fits within pagetable and slice limits. */
Becky Bruce41151e72011-06-28 09:54:48 +0000749#ifdef CONFIG_PPC_FSL_BOOK3E
750 if ((size < PAGE_SIZE) || !is_power_of_4(size))
751 return -EINVAL;
752#else
David Gibsond1837cb2009-10-26 19:24:31 +0000753 if (!is_power_of_2(size)
754 || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
755 return -EINVAL;
Becky Bruce41151e72011-06-28 09:54:48 +0000756#endif
Jon Tollefson91224342008-07-23 21:27:55 -0700757
David Gibsond1837cb2009-10-26 19:24:31 +0000758 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
759 return -EINVAL;
760
David Gibsond1837cb2009-10-26 19:24:31 +0000761 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
762
763 /* Return if huge page size has already been setup */
764 if (size_to_hstate(size))
765 return 0;
766
767 hugetlb_add_hstate(shift - PAGE_SHIFT);
768
769 return 0;
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100770}
771
772static int __init hugepage_setup_sz(char *str)
773{
774 unsigned long long size;
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100775
776 size = memparse(str, &str);
777
Vaishali Thakkar71bf79c2016-05-19 17:11:14 -0700778 if (add_huge_page_size(size) != 0) {
779 hugetlb_bad_size();
780 pr_err("Invalid huge page size specified(%llu)\n", size);
781 }
Jon Tollefson4ec161c2008-01-04 09:59:50 +1100782
783 return 1;
784}
785__setup("hugepagesz=", hugepage_setup_sz);
786
Becky Bruce881fde12011-10-10 10:50:40 +0000787#ifdef CONFIG_PPC_FSL_BOOK3E
Becky Bruce41151e72011-06-28 09:54:48 +0000788struct kmem_cache *hugepte_cache;
789static int __init hugetlbpage_init(void)
790{
791 int psize;
792
793 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
794 unsigned shift;
795
796 if (!mmu_psize_defs[psize].shift)
797 continue;
798
799 shift = mmu_psize_to_shift(psize);
800
801 /* Don't treat normal page sizes as huge... */
802 if (shift != PAGE_SHIFT)
803 if (add_huge_page_size(1ULL << shift) < 0)
804 continue;
805 }
806
807 /*
808 * Create a kmem cache for hugeptes. The bottom bits in the pte have
809 * size information encoded in them, so align them to allow this
810 */
811 hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t),
812 HUGEPD_SHIFT_MASK + 1, 0, NULL);
813 if (hugepte_cache == NULL)
814 panic("%s: Unable to create kmem cache for hugeptes\n",
815 __func__);
816
817 /* Default hpage size = 4M */
818 if (mmu_psize_defs[MMU_PAGE_4M].shift)
819 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
820 else
821 panic("%s: Unable to set default huge page size\n", __func__);
822
823
824 return 0;
825}
826#else
David Gibsonf10a04c2006-04-28 15:02:51 +1000827static int __init hugetlbpage_init(void)
828{
David Gibsona4fe3ce2009-10-26 19:24:31 +0000829 int psize;
Jon Tollefson0d9ea752008-07-23 21:27:56 -0700830
Aneesh Kumar K.V48483762016-04-29 23:26:25 +1000831 if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
David Gibsonf10a04c2006-04-28 15:02:51 +1000832 return -ENODEV;
Benjamin Herrenschmidt00df4382008-07-28 16:13:18 +1000833
Jon Tollefson0d9ea752008-07-23 21:27:56 -0700834 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
David Gibsond1837cb2009-10-26 19:24:31 +0000835 unsigned shift;
836 unsigned pdshift;
837
838 if (!mmu_psize_defs[psize].shift)
839 continue;
840
841 shift = mmu_psize_to_shift(psize);
842
843 if (add_huge_page_size(1ULL << shift) < 0)
844 continue;
845
846 if (shift < PMD_SHIFT)
847 pdshift = PMD_SHIFT;
848 else if (shift < PUD_SHIFT)
849 pdshift = PUD_SHIFT;
850 else
851 pdshift = PGDIR_SHIFT;
Aneesh Kumar K.Ve2b3d202013-04-28 09:37:30 +0000852 /*
853 * if we have pdshift and shift value same, we don't
854 * use pgt cache for hugepd.
855 */
856 if (pdshift != shift) {
857 pgtable_cache_add(pdshift - shift, NULL);
858 if (!PGT_CACHE(pdshift - shift))
859 panic("hugetlbpage_init(): could not create "
860 "pgtable cache for %d bit pagesize\n", shift);
861 }
Jon Tollefson0d9ea752008-07-23 21:27:56 -0700862 }
David Gibsonf10a04c2006-04-28 15:02:51 +1000863
David Gibsond1837cb2009-10-26 19:24:31 +0000864 /* Set default large page size. Currently, we pick 16M or 1M
865 * depending on what is available
866 */
867 if (mmu_psize_defs[MMU_PAGE_16M].shift)
868 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
869 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
870 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
Aneesh Kumar K.V48483762016-04-29 23:26:25 +1000871 else if (mmu_psize_defs[MMU_PAGE_2M].shift)
872 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift;
873
David Gibsond1837cb2009-10-26 19:24:31 +0000874
David Gibsonf10a04c2006-04-28 15:02:51 +1000875 return 0;
876}
Becky Bruce41151e72011-06-28 09:54:48 +0000877#endif
Paul Gortmaker6f114282015-05-01 20:08:21 -0400878arch_initcall(hugetlbpage_init);
David Gibson0895ecd2009-10-26 19:24:31 +0000879
880void flush_dcache_icache_hugepage(struct page *page)
881{
882 int i;
Becky Bruce41151e72011-06-28 09:54:48 +0000883 void *start;
David Gibson0895ecd2009-10-26 19:24:31 +0000884
885 BUG_ON(!PageCompound(page));
886
Becky Bruce41151e72011-06-28 09:54:48 +0000887 for (i = 0; i < (1UL << compound_order(page)); i++) {
888 if (!PageHighMem(page)) {
889 __flush_dcache_icache(page_address(page+i));
890 } else {
Cong Wang2480b202011-11-25 23:14:16 +0800891 start = kmap_atomic(page+i);
Becky Bruce41151e72011-06-28 09:54:48 +0000892 __flush_dcache_icache(start);
Cong Wang2480b202011-11-25 23:14:16 +0800893 kunmap_atomic(start);
Becky Bruce41151e72011-06-28 09:54:48 +0000894 }
895 }
David Gibson0895ecd2009-10-26 19:24:31 +0000896}
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530897
898#endif /* CONFIG_HUGETLB_PAGE */
899
900/*
901 * We have 4 cases for pgds and pmds:
902 * (1) invalid (all zeroes)
903 * (2) pointer to next table, as normal; bottom 6 bits == 0
Aneesh Kumar K.V6a119ea2015-12-01 09:06:54 +0530904 * (3) leaf pte for huge page _PAGE_PTE set
905 * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530906 *
907 * So long as we atomically load page table pointers we are safe against teardown,
908 * we can follow the address down to the the page and take a ref on it.
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530909 * This function need to be called with interrupts disabled. We use this variant
910 * when we have MSR[EE] = 0 but the paca->soft_enabled = 1
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530911 */
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530912
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530913pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +0530914 bool *is_thp, unsigned *shift)
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530915{
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530916 pgd_t pgd, *pgdp;
917 pud_t pud, *pudp;
918 pmd_t pmd, *pmdp;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530919 pte_t *ret_pte;
920 hugepd_t *hpdp = NULL;
921 unsigned pdshift = PGDIR_SHIFT;
922
923 if (shift)
924 *shift = 0;
925
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +0530926 if (is_thp)
927 *is_thp = false;
928
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530929 pgdp = pgdir + pgd_index(ea);
Michael Ellerman4f9c53c2015-03-25 20:11:57 +1100930 pgd = READ_ONCE(*pgdp);
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530931 /*
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530932 * Always operate on the local stack value. This make sure the
933 * value don't get updated by a parallel THP split/collapse,
934 * page fault or a page unmap. The return pte_t * is still not
935 * stable. So should be checked there for above conditions.
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530936 */
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530937 if (pgd_none(pgd))
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530938 return NULL;
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530939 else if (pgd_huge(pgd)) {
940 ret_pte = (pte_t *) pgdp;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530941 goto out;
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530942 } else if (is_hugepd(__hugepd(pgd_val(pgd))))
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530943 hpdp = (hugepd_t *)&pgd;
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530944 else {
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530945 /*
946 * Even if we end up with an unmap, the pgtable will not
947 * be freed, because we do an rcu free and here we are
948 * irq disabled
949 */
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530950 pdshift = PUD_SHIFT;
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530951 pudp = pud_offset(&pgd, ea);
Christian Borntraegerda1a2882015-01-06 22:47:41 +0100952 pud = READ_ONCE(*pudp);
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530953
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530954 if (pud_none(pud))
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530955 return NULL;
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530956 else if (pud_huge(pud)) {
957 ret_pte = (pte_t *) pudp;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530958 goto out;
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530959 } else if (is_hugepd(__hugepd(pud_val(pud))))
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530960 hpdp = (hugepd_t *)&pud;
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530961 else {
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530962 pdshift = PMD_SHIFT;
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530963 pmdp = pmd_offset(&pud, ea);
Christian Borntraegerda1a2882015-01-06 22:47:41 +0100964 pmd = READ_ONCE(*pmdp);
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530965 /*
966 * A hugepage collapse is captured by pmd_none, because
967 * it mark the pmd none and do a hpte invalidate.
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530968 */
Aneesh Kumar K.V7d6e7f72015-03-30 10:41:04 +0530969 if (pmd_none(pmd))
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530970 return NULL;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530971
Aneesh Kumar K.V891121e2015-10-09 08:32:21 +0530972 if (pmd_trans_huge(pmd)) {
973 if (is_thp)
974 *is_thp = true;
975 ret_pte = (pte_t *) pmdp;
976 goto out;
977 }
978
979 if (pmd_huge(pmd)) {
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530980 ret_pte = (pte_t *) pmdp;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530981 goto out;
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530982 } else if (is_hugepd(__hugepd(pmd_val(pmd))))
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530983 hpdp = (hugepd_t *)&pmd;
Aneesh Kumar K.Vac52ae42013-06-20 14:30:17 +0530984 else
Aneesh Kumar K.V0ac52dd2013-06-20 14:30:22 +0530985 return pte_offset_kernel(&pmd, ea);
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530986 }
987 }
988 if (!hpdp)
989 return NULL;
990
Aneesh Kumar K.Vb30e7592014-11-05 21:57:41 +0530991 ret_pte = hugepte_offset(*hpdp, ea, pdshift);
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530992 pdshift = hugepd_shift(*hpdp);
993out:
994 if (shift)
995 *shift = pdshift;
996 return ret_pte;
997}
Aneesh Kumar K.V691e95f2015-03-30 10:41:03 +0530998EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
Aneesh Kumar K.V29409992013-06-20 14:30:16 +0530999
1000int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
1001 unsigned long end, int write, struct page **pages, int *nr)
1002{
1003 unsigned long mask;
1004 unsigned long pte_end;
Kirill A. Shutemovddc58f22016-01-15 16:52:56 -08001005 struct page *head, *page;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +05301006 pte_t pte;
1007 int refs;
1008
1009 pte_end = (addr + sz) & ~(sz-1);
1010 if (pte_end < end)
1011 end = pte_end;
1012
Michael Ellerman4f9c53c2015-03-25 20:11:57 +11001013 pte = READ_ONCE(*ptep);
Aneesh Kumar K.Vac29c642016-04-29 23:25:34 +10001014 mask = _PAGE_PRESENT | _PAGE_READ;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +05301015 if (write)
Aneesh Kumar K.Vc7d54842016-04-29 23:25:30 +10001016 mask |= _PAGE_WRITE;
Aneesh Kumar K.V29409992013-06-20 14:30:16 +05301017
1018 if ((pte_val(pte) & mask) != mask)
1019 return 0;
1020
1021 /* hugepages are never "special" */
1022 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
1023
1024 refs = 0;
1025 head = pte_page(pte);
1026
1027 page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
Aneesh Kumar K.V29409992013-06-20 14:30:16 +05301028 do {
1029 VM_BUG_ON(compound_head(page) != head);
1030 pages[*nr] = page;
1031 (*nr)++;
1032 page++;
1033 refs++;
1034 } while (addr += PAGE_SIZE, addr != end);
1035
1036 if (!page_cache_add_speculative(head, refs)) {
1037 *nr -= refs;
1038 return 0;
1039 }
1040
1041 if (unlikely(pte_val(pte) != pte_val(*ptep))) {
1042 /* Could be optimized better */
1043 *nr -= refs;
1044 while (refs--)
1045 put_page(head);
1046 return 0;
1047 }
1048
Aneesh Kumar K.V29409992013-06-20 14:30:16 +05301049 return 1;
1050}