/*
 * This file contains ioremap and related functions for 64-bit machines.
 *
 * Derived from arch/ppc64/mm/init.c
 *   Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@samba.org)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>

#include "mmu_decl.h"

/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
#endif

#ifdef CONFIG_PPC_STD_MMU_64
#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
#error TASK_SIZE_USER64 exceeds user VSID range
#endif
#endif

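/*
 * Bottom of the 64-bit ioremap region.  Early mappings made before
 * mem_init_done are carved out upwards from here; once the vmalloc
 * infrastructure is up, ioremap allocations are taken from the range
 * ioremap_bot..IOREMAP_END (see __ioremap_caller() below).
 */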
unsigned long ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PPC_MMU_NOHASH
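/*
 * Allocate and zero a page-table fragment before the slab allocator is
 * available, using bootmem or memblock depending on how far boot has
 * progressed.
 */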
static void *early_alloc_pgtable(unsigned long size)
{
	void *pt;

	if (init_bootmem_done)
		pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS));
	else
		pt = __va(memblock_alloc_base(size, size,
					      __pa(MAX_DMA_ADDRESS)));
	memset(pt, 0, size);

	return pt;
}
#endif /* CONFIG_PPC_MMU_NOHASH */

/*
 * map_kernel_page currently only called by __ioremap
 * map_kernel_page adds an entry to the ioremap page table
 * and adds an entry to the HPT, possibly bolting it
 */
int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
							  __pgprot(flags)));
	} else {
#ifdef CONFIG_PPC_MMU_NOHASH
		/* Warning! This will blow up if bootmem is not initialized,
		 * which our ppc64 code is keen to do.  We'll need to fix it
		 * and/or be more careful.
		 */
		pgdp = pgd_offset_k(ea);
#ifdef PUD_TABLE_SIZE
		if (pgd_none(*pgdp)) {
			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
			BUG_ON(pudp == NULL);
			pgd_populate(&init_mm, pgdp, pudp);
		}
#endif /* PUD_TABLE_SIZE */
		pudp = pud_offset(pgdp, ea);
		if (pud_none(*pudp)) {
			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
			BUG_ON(pmdp == NULL);
			pud_populate(&init_mm, pudp, pmdp);
		}
		pmdp = pmd_offset(pudp, ea);
		if (!pmd_present(*pmdp)) {
			ptep = early_alloc_pgtable(PAGE_SIZE);
			BUG_ON(ptep == NULL);
			pmd_populate_kernel(&init_mm, pmdp, ptep);
		}
		ptep = pte_offset_kernel(pmdp, ea);
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
							  __pgprot(flags)));
#else /* CONFIG_PPC_MMU_NOHASH */
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
#endif /* !CONFIG_PPC_MMU_NOHASH */
	}
	return 0;
}


/**
 * __ioremap_at - Low level function to establish the page tables
 *                for an IO mapping
 */
void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
			    unsigned long flags)
{
	unsigned long i;

	/* Make sure we have the base flags */
	if ((flags & _PAGE_PRESENT) == 0)
		flags |= pgprot_val(PAGE_KERNEL);

	/* Non-cacheable page cannot be coherent */
	if (flags & _PAGE_NO_CACHE)
		flags &= ~_PAGE_COHERENT;

	/* We don't support the 4K PFN hack with ioremap */
	if (flags & _PAGE_4K_PFN)
		return NULL;

	WARN_ON(pa & ~PAGE_MASK);
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	for (i = 0; i < size; i += PAGE_SIZE)
		if (map_kernel_page((unsigned long)ea+i, pa+i, flags))
			return NULL;

	return (void __iomem *)ea;
}

/**
 * __iounmap_at - Low level function to tear down the page tables
 *                for an IO mapping. This is used for mappings that
 *                are manipulated manually, like partial unmapping of
 *                PCI IOs or ISA space.
 */
void __iounmap_at(void *ea, unsigned long size)
{
	WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
	WARN_ON(size & ~PAGE_MASK);

	unmap_kernel_range((unsigned long)ea, size);
}

void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size,
				unsigned long flags, void *caller)
{
	phys_addr_t paligned;
	void __iomem *ret;

	/*
	 * Choose an address to map it to.  Once the vmalloc system is
	 * running, we use it.  Before that, we map using addresses going
	 * up from ioremap_bot.  vmalloc will hand out addresses in the
	 * range ioremap_bot through IOREMAP_END.
	 */
	paligned = addr & PAGE_MASK;
	size = PAGE_ALIGN(addr + size) - paligned;

	if ((size == 0) || (paligned == 0))
		return NULL;

	if (mem_init_done) {
		struct vm_struct *area;

		area = __get_vm_area_caller(size, VM_IOREMAP,
					    ioremap_bot, IOREMAP_END,
					    caller);
		if (area == NULL)
			return NULL;

		area->phys_addr = paligned;
		ret = __ioremap_at(paligned, area->addr, size, flags);
		if (!ret)
			vunmap(area->addr);
	} else {
		ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags);
		if (ret)
			ioremap_bot += size;
	}

	if (ret)
		ret += addr & ~PAGE_MASK;
	return ret;
}

void __iomem * __ioremap(phys_addr_t addr, unsigned long size,
			 unsigned long flags)
{
	return __ioremap_caller(addr, size, flags, __builtin_return_address(0));
}

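/*
 * ioremap() maps MMIO space cache-inhibited and guarded, which is what
 * device registers normally want.  Platforms can override the mapping
 * through the ppc_md.ioremap hook.
 */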
void __iomem * ioremap(phys_addr_t addr, unsigned long size)
{
	unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
	void *caller = __builtin_return_address(0);

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}

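/*
 * ioremap_wc() leaves out _PAGE_GUARDED, so the mapping is still
 * cache-inhibited but the processor may combine and reorder stores to
 * it (write-combining semantics).
 */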
void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
{
	unsigned long flags = _PAGE_NO_CACHE;
	void *caller = __builtin_return_address(0);

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}

void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size,
			     unsigned long flags)
{
	void *caller = __builtin_return_address(0);

	/* writeable implies dirty for kernel addresses */
	if (flags & _PAGE_RW)
		flags |= _PAGE_DIRTY;

	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
	flags &= ~(_PAGE_USER | _PAGE_EXEC);

#ifdef _PAGE_BAP_SR
	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
	 * which means that we just cleared supervisor access... oops ;-) This
	 * restores it
	 */
	flags |= _PAGE_BAP_SR;
#endif

	if (ppc_md.ioremap)
		return ppc_md.ioremap(addr, size, flags, caller);
	return __ioremap_caller(addr, size, flags, caller);
}


/*
 * Unmap an IO region and remove it from the vmalloc'd mappings.
 * Access to IO memory should be serialized by the driver.
 */
void __iounmap(volatile void __iomem *token)
{
	void *addr;

	if (!mem_init_done)
		return;

	addr = (void *) ((unsigned long __force)
			 PCI_FIX_ADDR(token) & PAGE_MASK);
	if ((unsigned long)addr < ioremap_bot) {
		printk(KERN_WARNING "Attempt to iounmap early bolted mapping"
		       " at 0x%p\n", addr);
		return;
	}
	vunmap(addr);
}

void iounmap(volatile void __iomem *token)
{
	if (ppc_md.iounmap)
		ppc_md.iounmap(token);
	else
		__iounmap(token);
}

EXPORT_SYMBOL(ioremap);
EXPORT_SYMBOL(ioremap_wc);
EXPORT_SYMBOL(ioremap_prot);
EXPORT_SYMBOL(__ioremap);
EXPORT_SYMBOL(__ioremap_at);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(__iounmap);
EXPORT_SYMBOL(__iounmap_at);

/*
 * For a hugepage we have the pfn in the pmd and use the low PTE_RPN_SHIFT
 * bits for flags.  For a PTE page, we have a PTE_FRAG_SIZE (4K) aligned
 * virtual address.
 */
struct page *pmd_page(pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(pmd))
		return pfn_to_page(pmd_pfn(pmd));
#endif
	return virt_to_page(pmd_page_vaddr(pmd));
}

#ifdef CONFIG_PPC_64K_PAGES
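/*
 * With 64K pages a full page per PTE table would be wasteful, so PTE
 * tables are handed out as PTE_FRAG_SIZE fragments of a page.  Each mm
 * caches the next free fragment in mm->context.pte_frag, and the page's
 * _count tracks how many of the PTE_FRAG_NR fragments are still in use.
 */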
static pte_t *get_from_cache(struct mm_struct *mm)
{
	void *pte_frag, *ret;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pte_frag;
	if (ret) {
		pte_frag = ret + PTE_FRAG_SIZE;
		/*
		 * If we have taken up all the fragments mark PTE page NULL
		 */
		if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
			pte_frag = NULL;
		mm->context.pte_frag = pte_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pte_t *)ret;
}

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
	void *ret = NULL;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
				       __GFP_REPEAT | __GFP_ZERO);
	if (!page)
		return NULL;

	ret = page_address(page);
	spin_lock(&mm->page_table_lock);
	/*
	 * If we find mm->context.pte_frag already set (we raced with
	 * another thread), return the freshly allocated page with a
	 * single fragment count.
	 */
	if (likely(!mm->context.pte_frag)) {
		atomic_set(&page->_count, PTE_FRAG_NR);
		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
	}
	spin_unlock(&mm->page_table_lock);

	if (!kernel)
		pgtable_page_ctor(page);

	return (pte_t *)ret;
}

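/*
 * Hand out a PTE fragment, allocating (and caching) a fresh page if the
 * per-mm fragment cache is empty.
 */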
pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
{
	pte_t *pte;

	pte = get_from_cache(mm);
	if (pte)
		return pte;

	return __alloc_for_cache(mm, kernel);
}

void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
{
	struct page *page = virt_to_page(table);
	if (put_page_testzero(page)) {
		if (!kernel)
			pgtable_page_dtor(page);
		free_hot_cold_page(page, 0);
	}
}

#ifdef CONFIG_SMP
static void page_table_free_rcu(void *table)
{
	struct page *page = virt_to_page(table);
	if (put_page_testzero(page)) {
		pgtable_page_dtor(page);
		free_hot_cold_page(page, 0);
	}
}

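/*
 * Page-table pages freed under a TLB gather go through tlb_remove_table()
 * so the actual free is deferred until concurrent lockless walkers are
 * done with them.  The PGT_CACHE index (shift) is stashed in the low bits
 * of the table pointer; shift == 0 denotes a PTE fragment page, which
 * needs the special handling above.
 */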
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	unsigned long pgf = (unsigned long)table;

	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
	pgf |= shift;
	tlb_remove_table(tlb, (void *)pgf);
}

void __tlb_remove_table(void *_table)
{
	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;

	if (!shift)
		/* PTE page needs special handling */
		page_table_free_rcu(table);
	else {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
	}
}
#else
void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
{
	if (!shift) {
		/* PTE page needs special handling */
		struct page *page = virt_to_page(table);
		if (put_page_testzero(page)) {
			pgtable_page_dtor(page);
			free_hot_cold_page(page, 0);
		}
	} else {
		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
		kmem_cache_free(PGT_CACHE(shift), table);
	}
}
#endif
#endif /* CONFIG_PPC_64K_PAGES */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * This is called when relaxing access to a hugepage. It's also called in
 * the page fault path when we don't hit any of the major fault cases,
 * i.e. a minor update of _PAGE_ACCESSED, _PAGE_DIRTY, etc...  The generic
 * code will have handled those two for us; we additionally deal with
 * missing execute permission here on some processors.
 */
int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp, pmd_t entry, int dirty)
{
	int changed;
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif
	changed = !pmd_same(*(pmdp), entry);
	if (changed) {
		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
		/*
		 * Since we are not supporting SW TLB systems, we don't
		 * have anything similar to flush_tlb_page_nohash()
		 */
	}
	return changed;
}

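/*
 * Clear the bits in 'clr' in a huge PMD, waiting for _PAGE_BUSY to be
 * released when atomic PTE updates are in use, and flush the hash page
 * table entries if the old PMD had _PAGE_HASHPTE set.  Returns the old
 * PMD value.
 */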
unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr)
{

	unsigned long old, tmp;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES
	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		andi.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY)
	: "cc" );
#else
	old = pmd_val(*pmdp);
	*pmdp = __pmd(old & ~clr);
#endif
	if (old & _PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp);
	return old;
}

pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (pmd_trans_huge(*pmdp)) {
		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
	} else {
		/*
		 * khugepaged calls this for a normal pmd
		 */
		pmd = *pmdp;
		pmd_clear(pmdp);
		/*
		 * Wait for all pending hash_page to finish. This is needed
		 * in case of subpage collapse.  When we collapse normal pages
		 * to hugepage, we first clear the pmd, then invalidate all
		 * the PTE entries. The assumption here is that any low level
		 * page fault will see a none pmd and take the slow path that
		 * will wait on mmap_sem. But we could very well be in a
		 * hash_page with local ptep pointer value. Such a hash page
		 * can result in adding new HPTE entries for normal subpages.
		 * That means we could be modifying the page content as we
		 * copy them to a huge page. So wait for parallel hash_page
		 * to finish before invalidating HPTE entries. We can do this
		 * by sending an IPI to all the cpus and executing a dummy
		 * function there.
		 */
		kick_all_cpus_sync();
		/*
		 * Now invalidate the hpte entries in the range covered by
		 * pmd. This makes sure we take a fault and will find the
		 * pmd as none, which will result in a major fault which
		 * takes mmap_sem and hence waits for collapse to complete.
		 * Without this, __collapse_huge_page_copy can result in
		 * copying the old content.
		 */
		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	}
	return pmd;
}

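/*
 * Test and clear the accessed (young) bit on a huge PMD; this shares the
 * underlying helper with pmdp_clear_flush_young() below.
 */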
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long address, pmd_t *pmdp)
{
	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We currently remove entries from the hashtable regardless of whether
 * the entry was young or dirty. The generic routines only flush if the
 * entry was young or dirty which is not good enough.
 *
 * We should be more intelligent about this but for the moment we override
 * these functions and force a tlb flush unconditionally
 */
int pmdp_clear_flush_young(struct vm_area_struct *vma,
				  unsigned long address, pmd_t *pmdp)
{
	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
}

/*
 * We mark the pmd splitting and invalidate all the hpte
 * entries for this hugepage.
 */
void pmdp_splitting_flush(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp)
{
	unsigned long old, tmp;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&vma->vm_mm->page_table_lock);
#endif

#ifdef PTE_ATOMIC_UPDATES

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		andi.	%1,%0,%6\n\
		bne-	1b \n\
		ori	%1,%0,%4 \n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY)
	: "cc" );
#else
	old = pmd_val(*pmdp);
	*pmdp = __pmd(old | _PAGE_SPLITTING);
#endif
	/*
	 * If we didn't have the splitting flag set, go and flush the
	 * HPTE entries.
	 */
	if (!(old & _PAGE_SPLITTING)) {
		/* We need to flush the hpte */
		if (old & _PAGE_HASHPTE)
			hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
	}
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;
	assert_spin_locked(&mm->page_table_lock);
	/*
	 * We store the pgtable in the second half of the PMD.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * Expose the deposited pgtable to other cpus before we set the
	 * hugepage PTE at the pmd level; the hash fault code looks at the
	 * deposited pgtable to store hash index values.
	 */
	smp_wmb();
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * Zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

/*
 * Set a new huge pmd. We should not be called for updating
 * an existing pmd entry. That should go via pmd_hugepage_update.
 */
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_none(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
	WARN_ON(!pmd_trans_huge(pmd));
#endif
	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}

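/*
 * Invalidate a huge PMD by clearing _PAGE_PRESENT; pmd_hugepage_update()
 * takes care of flushing any hash page table entries for it.
 */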
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp)
{
	int ssize, i;
	unsigned long s_addr;
	unsigned int psize, valid;
	unsigned char *hpte_slot_array;
	unsigned long hidx, vpn, vsid, hash, shift, slot;

	/*
	 * Flush all the hptes mapping this hugepage
	 */
	s_addr = addr & HPAGE_PMD_MASK;
	hpte_slot_array = get_hpte_slot_array(pmdp);
	/*
	 * If we try to do a HUGE PTE update after a withdraw is done,
	 * we will find the below NULL. This happens when we do
	 * split_huge_page_pmd.
	 */
	if (!hpte_slot_array)
		return;

	/* get the base page size */
	psize = get_slice_psize(mm, s_addr);
	shift = mmu_psize_defs[psize].shift;

	for (i = 0; i < (HPAGE_PMD_SIZE >> shift); i++) {
		/*
		 * 8 bits per hpte entry:
		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
		 */
		valid = hpte_valid(hpte_slot_array, i);
		if (!valid)
			continue;
		hidx =  hpte_hash_index(hpte_slot_array, i);

		/* get the vpn */
		addr = s_addr + (i * (1ul << shift));
		if (!is_kernel_addr(addr)) {
			ssize = user_segment_size(addr);
			vsid = get_vsid(mm->context.id, addr, ssize);
			WARN_ON(vsid == 0);
		} else {
			vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
			ssize = mmu_kernel_ssize;
		}

		vpn = hpt_vpn(addr, vsid, ssize);
		hash = hpt_hash(vpn, shift, ssize);
		if (hidx & _PTEIDX_SECONDARY)
			hash = ~hash;

		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += hidx & _PTEIDX_GROUP_IX;
		ppc_md.hpte_invalidate(slot, vpn, psize,
				       MMU_PAGE_16M, ssize, 0);
	}
}

static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
{
	pmd_val(pmd) |= pgprot_val(pgprot);
	return pmd;
}

pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
{
	pmd_t pmd;
	/*
	 * For a valid pte, we would have _PAGE_PRESENT or _PAGE_FILE always
	 * set. We use this to check a THP page at the pmd level: a leaf pte
	 * for a huge page has its bottom two bits != 00.
	 */
	pmd_val(pmd) = pfn << PTE_RPN_SHIFT;
	pmd_val(pmd) |= _PAGE_THP_HUGE;
	pmd = pmd_set_protbits(pmd, pgprot);
	return pmd;
}

pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
{
	return pfn_pmd(page_to_pfn(page), pgprot);
}

pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{

	pmd_val(pmd) &= _HPAGE_CHG_MASK;
	pmd = pmd_set_protbits(pmd, newprot);
	return pmd;
}

/*
 * This is called at the end of handling a user page fault, when the
 * fault has been handled by updating a HUGE PMD entry in the linux page tables.
 * We use it to preload an HPTE into the hash table corresponding to
 * the updated linux HUGE PMD entry.
 */
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
			  pmd_t *pmd)
{
	return;
}

pmd_t pmdp_get_and_clear(struct mm_struct *mm,
			 unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out the old valid and hash index details;
	 * the hash fault code looks at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return old_pmd;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */