Blame - arch/powerpc/mm/pgtable-hash64.c - kernel/msm-4.9

blob: c23e286a6b8ff85822cd1eff1c24bd91baa2bfd0 [file] [log] [blame]

Aneesh Kumar K.V	eee24b5	2016-04-29 23:25:44 +1000	[diff] [blame]	1	/*
				2	* Copyright 2005, Paul Mackerras, IBM Corporation.
				3	* Copyright 2009, Benjamin Herrenschmidt, IBM Corporation.
				4	* Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version
				9	* 2 of the License, or (at your option) any later version.
				10	*/
				11
				12	#include <linux/sched.h>
				13	#include <asm/pgalloc.h>
				14	#include <asm/tlb.h>
				15
				16	#include "mmu_decl.h"
				17
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	18	#define CREATE_TRACE_POINTS
				19	#include <trace/events/thp.h>
				20
Aneesh Kumar K.V	eee24b5	2016-04-29 23:25:44 +1000	[diff] [blame]	21	#ifdef CONFIG_SPARSEMEM_VMEMMAP
				22	/*
				23	* On hash-based CPUs, the vmemmap is bolted in the hash table.
				24	*
				25	*/
Aneesh Kumar K.V	31a14fa	2016-04-29 23:25:59 +1000	[diff] [blame]	26	int __meminit hash__vmemmap_create_mapping(unsigned long start,
				27	unsigned long page_size,
				28	unsigned long phys)
Aneesh Kumar K.V	eee24b5	2016-04-29 23:25:44 +1000	[diff] [blame]	29	{
				30	int rc = htab_bolt_mapping(start, start + page_size, phys,
				31	pgprot_val(PAGE_KERNEL),
				32	mmu_vmemmap_psize, mmu_kernel_ssize);
				33	if (rc < 0) {
				34	int rc2 = htab_remove_mapping(start, start + page_size,
				35	mmu_vmemmap_psize,
				36	mmu_kernel_ssize);
				37	BUG_ON(rc2 && (rc2 != -ENOENT));
				38	}
				39	return rc;
				40	}
				41
				42	#ifdef CONFIG_MEMORY_HOTPLUG
Aneesh Kumar K.V	31a14fa	2016-04-29 23:25:59 +1000	[diff] [blame]	43	void hash__vmemmap_remove_mapping(unsigned long start,
				44	unsigned long page_size)
Aneesh Kumar K.V	eee24b5	2016-04-29 23:25:44 +1000	[diff] [blame]	45	{
				46	int rc = htab_remove_mapping(start, start + page_size,
				47	mmu_vmemmap_psize,
				48	mmu_kernel_ssize);
				49	BUG_ON((rc < 0) && (rc != -ENOENT));
				50	WARN_ON(rc == -ENOENT);
				51	}
				52	#endif
				53	#endif /* CONFIG_SPARSEMEM_VMEMMAP */
				54
				55	/*
				56	* map_kernel_page currently only called by __ioremap
				57	* map_kernel_page adds an entry to the ioremap page table
				58	* and adds an entry to the HPT, possibly bolting it
				59	*/
Aneesh Kumar K.V	31a14fa	2016-04-29 23:25:59 +1000	[diff] [blame]	60	int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
Aneesh Kumar K.V	eee24b5	2016-04-29 23:25:44 +1000	[diff] [blame]	61	{
				62	pgd_t *pgdp;
				63	pud_t *pudp;
				64	pmd_t *pmdp;
				65	pte_t *ptep;
				66
Aneesh Kumar K.V	dd1842a	2016-04-29 23:25:49 +1000	[diff] [blame]	67	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
Aneesh Kumar K.V	eee24b5	2016-04-29 23:25:44 +1000	[diff] [blame]	68	if (slab_is_available()) {
				69	pgdp = pgd_offset_k(ea);
				70	pudp = pud_alloc(&init_mm, pgdp, ea);
				71	if (!pudp)
				72	return -ENOMEM;
				73	pmdp = pmd_alloc(&init_mm, pudp, ea);
				74	if (!pmdp)
				75	return -ENOMEM;
				76	ptep = pte_alloc_kernel(pmdp, ea);
				77	if (!ptep)
				78	return -ENOMEM;
				79	set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
				80	__pgprot(flags)));
				81	} else {
				82	/*
				83	* If the mm subsystem is not fully up, we cannot create a
				84	* linux page table entry for this mapping. Simply bolt an
				85	* entry in the hardware page table.
				86	*
				87	*/
				88	if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				89	mmu_io_psize, mmu_kernel_ssize)) {
				90	printk(KERN_ERR "Failed to do bolted mapping IO "
				91	"memory at %016lx !\n", pa);
				92	return -ENOMEM;
				93	}
				94	}
				95
				96	smp_wmb();
				97	return 0;
				98	}
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	99
				100	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				101
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	102	unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				103	pmd_t *pmdp, unsigned long clr,
				104	unsigned long set)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	105	{
				106	__be64 old_be, tmp;
				107	unsigned long old;
				108
				109	#ifdef CONFIG_DEBUG_VM
				110	WARN_ON(!pmd_trans_huge(*pmdp));
				111	assert_spin_locked(&mm->page_table_lock);
				112	#endif
				113
				114	__asm__ __volatile__(
				115	"1: ldarx %0,0,%3\n\
				116	and. %1,%0,%6\n\
				117	bne- 1b \n\
				118	andc %1,%0,%4 \n\
				119	or %1,%1,%7\n\
				120	stdcx. %1,0,%3 \n\
				121	bne- 1b"
				122	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
				123	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
				124	"r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
				125	: "cc" );
				126
				127	old = be64_to_cpu(old_be);
				128
				129	trace_hugepage_update(addr, old, clr, set);
				130	if (old & H_PAGE_HASHPTE)
				131	hpte_do_hugepage_flush(mm, addr, pmdp, old);
				132	return old;
				133	}
				134
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	135	pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				136	pmd_t *pmdp)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	137	{
				138	pmd_t pmd;
				139
				140	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
				141	VM_BUG_ON(pmd_trans_huge(*pmdp));
				142
				143	pmd = *pmdp;
				144	pmd_clear(pmdp);
				145	/*
				146	* Wait for all pending hash_page to finish. This is needed
				147	* in case of subpage collapse. When we collapse normal pages
				148	* to hugepage, we first clear the pmd, then invalidate all
				149	* the PTE entries. The assumption here is that any low level
				150	* page fault will see a none pmd and take the slow path that
				151	* will wait on mmap_sem. But we could very well be in a
				152	* hash_page with local ptep pointer value. Such a hash page
				153	* can result in adding new HPTE entries for normal subpages.
				154	* That means we could be modifying the page content as we
				155	* copy them to a huge page. So wait for parallel hash_page
				156	* to finish before invalidating HPTE entries. We can do this
				157	* by sending an IPI to all the cpus and executing a dummy
				158	* function there.
				159	*/
				160	kick_all_cpus_sync();
				161	/*
				162	* Now invalidate the hpte entries in the range
				163	* covered by pmd. This make sure we take a
				164	* fault and will find the pmd as none, which will
				165	* result in a major fault which takes mmap_sem and
				166	* hence wait for collapse to complete. Without this
				167	* the __collapse_huge_page_copy can result in copying
				168	* the old content.
				169	*/
				170	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
				171	return pmd;
				172	}
				173
				174	/*
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	175	* We want to put the pgtable in pmd and use pgtable for tracking
				176	* the base page size hptes
				177	*/
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	178	void hash__pgtable_trans_huge_deposit(struct mm_struct mm, pmd_t pmdp,
				179	pgtable_t pgtable)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	180	{
				181	pgtable_t *pgtable_slot;
				182	assert_spin_locked(&mm->page_table_lock);
				183	/*
				184	* we store the pgtable in the second half of PMD
				185	*/
				186	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
				187	*pgtable_slot = pgtable;
				188	/*
				189	* expose the deposited pgtable to other cpus.
				190	* before we set the hugepage PTE at pmd level
				191	* hash fault code looks at the deposted pgtable
				192	* to store hash index values.
				193	*/
				194	smp_wmb();
				195	}
				196
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	197	pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct mm, pmd_t pmdp)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	198	{
				199	pgtable_t pgtable;
				200	pgtable_t *pgtable_slot;
				201
				202	assert_spin_locked(&mm->page_table_lock);
				203	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
				204	pgtable = *pgtable_slot;
				205	/*
				206	* Once we withdraw, mark the entry NULL.
				207	*/
				208	*pgtable_slot = NULL;
				209	/*
				210	* We store HPTE information in the deposited PTE fragment.
				211	* zero out the content on withdraw.
				212	*/
				213	memset(pgtable, 0, PTE_FRAG_SIZE);
				214	return pgtable;
				215	}
				216
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	217	void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
				218	unsigned long address, pmd_t *pmdp)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	219	{
				220	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
				221	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
				222
				223	/*
				224	* We can't mark the pmd none here, because that will cause a race
				225	* against exit_mmap. We need to continue mark pmd TRANS HUGE, while
				226	* we spilt, but at the same time we wan't rest of the ppc64 code
				227	* not to insert hash pte on this, because we will be modifying
				228	* the deposited pgtable in the caller of this function. Hence
				229	* clear the _PAGE_USER so that we move the fault handling to
				230	* higher level function and that will serialize against ptl.
				231	* We need to flush existing hash pte entries here even though,
				232	* the translation is still valid, because we will withdraw
				233	* pgtable_t after this.
				234	*/
				235	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
				236	}
				237
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	238	/*
				239	* A linux hugepage PMD was changed and the corresponding hash table entries
				240	* neesd to be flushed.
				241	*/
				242	void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
				243	pmd_t *pmdp, unsigned long old_pmd)
				244	{
				245	int ssize;
				246	unsigned int psize;
				247	unsigned long vsid;
				248	unsigned long flags = 0;
				249	const struct cpumask *tmp;
				250
				251	/* get the base page size,vsid and segment size */
				252	#ifdef CONFIG_DEBUG_VM
				253	psize = get_slice_psize(mm, addr);
				254	BUG_ON(psize == MMU_PAGE_16M);
				255	#endif
				256	if (old_pmd & H_PAGE_COMBO)
				257	psize = MMU_PAGE_4K;
				258	else
				259	psize = MMU_PAGE_64K;
				260
				261	if (!is_kernel_addr(addr)) {
				262	ssize = user_segment_size(addr);
				263	vsid = get_vsid(mm->context.id, addr, ssize);
				264	WARN_ON(vsid == 0);
				265	} else {
				266	vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
				267	ssize = mmu_kernel_ssize;
				268	}
				269
				270	tmp = cpumask_of(smp_processor_id());
				271	if (cpumask_equal(mm_cpumask(mm), tmp))
				272	flags \|= HPTE_LOCAL_UPDATE;
				273
				274	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
				275	}
				276
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	277	pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				278	unsigned long addr, pmd_t *pmdp)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	279	{
				280	pmd_t old_pmd;
				281	pgtable_t pgtable;
				282	unsigned long old;
				283	pgtable_t *pgtable_slot;
				284
				285	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
				286	old_pmd = __pmd(old);
				287	/*
				288	* We have pmd == none and we are holding page_table_lock.
				289	* So we can safely go and clear the pgtable hash
				290	* index info.
				291	*/
				292	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
				293	pgtable = *pgtable_slot;
				294	/*
				295	* Let's zero out old valid and hash index details
				296	* hash fault look at them.
				297	*/
				298	memset(pgtable, 0, PTE_FRAG_SIZE);
				299	/*
				300	* Serialize against find_linux_pte_or_hugepte which does lock-less
				301	* lookup in page tables with local interrupts disabled. For huge pages
				302	* it casts pmd_t to pte_t. Since format of pte_t is different from
				303	* pmd_t we want to prevent transit from pmd pointing to page table
				304	* to pmd pointing to huge page (and back) while interrupts are disabled.
				305	* We clear pmd to possibly replace it with page table pointer in
				306	* different code paths. So make sure we wait for the parallel
				307	* find_linux_pte_or_hugepage to finish.
				308	*/
				309	kick_all_cpus_sync();
				310	return old_pmd;
				311	}
				312
Aneesh Kumar K.V	3df33f1	2016-04-29 23:26:29 +1000	[diff] [blame]	313	int hash__has_transparent_hugepage(void)
Aneesh Kumar K.V	6a1ea36	2016-04-29 23:26:28 +1000	[diff] [blame]	314	{
				315
				316	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
				317	return 0;
				318	/*
				319	* We support THP only if PMD_SIZE is 16MB.
				320	*/
				321	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
				322	return 0;
				323	/*
				324	* We need to make sure that we support 16MB hugepage in a segement
				325	* with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
				326	* of 64K.
				327	*/
				328	/*
				329	* If we have 64K HPTE, we will be using that by default
				330	*/
				331	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
				332	(mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
				333	return 0;
				334	/*
				335	* Ok we only have 4K HPTE
				336	*/
				337	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
				338	return 0;
				339
				340	return 1;
				341	}
				342	#endif /* CONFIG_TRANSPARENT_HUGEPAGE */