s390/mm: split arch/s390/mm/pgtable.c

The pgtable.c file is quite big; before it grows any larger, split it
into pgtable.c, pgalloc.c and gmap.c. In addition, move the gmap related
header definitions into the new gmap.h header, and move all of the pgste
helpers from pgtable.h to pgtable.c.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
new file mode 100644
index 0000000..f6c3de2
--- /dev/null
+++ b/arch/s390/mm/pgalloc.c
@@ -0,0 +1,360 @@
+/*
+ *  Page table allocation functions
+ *
+ *    Copyright IBM Corp. 2016
+ *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
+#include <asm/gmap.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+#ifdef CONFIG_PGSTE
+
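+/*
+ * The vm.allocate_pgste sysctl forces new page tables to be allocated
+ * as full 4K pages with page status extensions (PGSTEs); this is
+ * required to run KVM guests on s390.
+ */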
+static int page_table_allocate_pgste_min = 0;
+static int page_table_allocate_pgste_max = 1;
+int page_table_allocate_pgste = 0;
+EXPORT_SYMBOL(page_table_allocate_pgste);
+
+static struct ctl_table page_table_sysctl[] = {
+	{
+		.procname	= "allocate_pgste",
+		.data		= &page_table_allocate_pgste,
+		.maxlen		= sizeof(int),
+		.mode		= S_IRUGO | S_IWUSR,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &page_table_allocate_pgste_min,
+		.extra2		= &page_table_allocate_pgste_max,
+	},
+	{ }
+};
+
+static struct ctl_table page_table_sysctl_dir[] = {
+	{
+		.procname	= "vm",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= page_table_sysctl,
+	},
+	{ }
+};
+
+static int __init page_table_register_sysctl(void)
+{
+	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
+}
+__initcall(page_table_register_sysctl);
+
+#endif /* CONFIG_PGSTE */
+
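+/*
+ * Region and segment tables ("crst" tables) are 16KB in size, i.e.
+ * 2048 entries of 8 bytes, hence the order-2 page allocation.
+ */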
+unsigned long *crst_table_alloc(struct mm_struct *mm)
+{
+	struct page *page = alloc_pages(GFP_KERNEL, 2);
+
+	if (!page)
+		return NULL;
+	return (unsigned long *) page_to_phys(page);
+}
+
+void crst_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	free_pages((unsigned long) table, 2);
+}
+
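+/*
+ * Grow the address space by putting a new top-level region table on
+ * top of the existing one (2GB -> 4TB -> 8PB).  All CPUs that are
+ * currently running the mm have to reload the user ASCE, hence the
+ * on_each_cpu() call in crst_table_upgrade().
+ */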
+static void __crst_table_upgrade(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		clear_user_asce();
+		set_user_asce(mm);
+	}
+	__tlb_flush_local();
+}
+
+int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
+{
+	unsigned long *table, *pgd;
+	unsigned long entry;
+	int flush;
+
+	BUG_ON(limit > TASK_MAX_SIZE);
+	flush = 0;
+repeat:
+	table = crst_table_alloc(mm);
+	if (!table)
+		return -ENOMEM;
+	spin_lock_bh(&mm->page_table_lock);
+	if (mm->context.asce_limit < limit) {
+		pgd = (unsigned long *) mm->pgd;
+		if (mm->context.asce_limit <= (1UL << 31)) {
+			entry = _REGION3_ENTRY_EMPTY;
+			mm->context.asce_limit = 1UL << 42;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_REGION3;
+		} else {
+			entry = _REGION2_ENTRY_EMPTY;
+			mm->context.asce_limit = 1UL << 53;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_REGION2;
+		}
+		crst_table_init(table, entry);
+		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
+		mm->pgd = (pgd_t *) table;
+		mm->task_size = mm->context.asce_limit;
+		table = NULL;
+		flush = 1;
+	}
+	spin_unlock_bh(&mm->page_table_lock);
+	if (table)
+		crst_table_free(mm, table);
+	if (mm->context.asce_limit < limit)
+		goto repeat;
+	if (flush)
+		on_each_cpu(__crst_table_upgrade, mm, 0);
+	return 0;
+}
+
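+/*
+ * Shrink the address space by removing top-level region tables until
+ * the requested limit is reached, e.g. down to 2GB when a 31-bit
+ * compat binary is executed.
+ */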
+void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
+{
+	pgd_t *pgd;
+
+	if (current->active_mm == mm) {
+		clear_user_asce();
+		__tlb_flush_mm(mm);
+	}
+	while (mm->context.asce_limit > limit) {
+		pgd = mm->pgd;
+		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
+		case _REGION_ENTRY_TYPE_R2:
+			mm->context.asce_limit = 1UL << 42;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_REGION3;
+			break;
+		case _REGION_ENTRY_TYPE_R3:
+			mm->context.asce_limit = 1UL << 31;
+			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
+						_ASCE_USER_BITS |
+						_ASCE_TYPE_SEGMENT;
+			break;
+		default:
+			BUG();
+		}
+		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
+		mm->task_size = mm->context.asce_limit;
+		crst_table_free(mm, (unsigned long *) pgd);
+	}
+	if (current->active_mm == mm)
+		set_user_asce(mm);
+}
+
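+/*
+ * page->_mapcount of a page table page is used as a bitmap: bits 0-1
+ * mark the two 2K fragments as allocated, bits 4-5 mark them as
+ * pending a deferred free (see page_table_free_rcu).  A value of 3
+ * denotes a full 4K page table with PGSTEs.  atomic_xor_bits() toggles
+ * the given bits and returns the new bitmap value.
+ */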
+static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
+{
+	unsigned int old, new;
+
+	do {
+		old = atomic_read(v);
+		new = old ^ bits;
+	} while (atomic_cmpxchg(v, old, new) != old);
+	return new;
+}
+
+/*
+ * Page table (pte) allocation and free routines.
+ * Without PGSTEs a page table needs only 2K (256 entries of 8 bytes),
+ * so two tables share a 4K page; pages with a free 2K half are kept on
+ * the per-mm pgtable_list.  With PGSTEs (KVM) the full page is used:
+ * the lower 2K hold the ptes, the upper 2K the PGSTEs.
+ */
+unsigned long *page_table_alloc(struct mm_struct *mm)
+{
+	unsigned long *table;
+	struct page *page;
+	unsigned int mask, bit;
+
+	/* Try to get a fragment of a 4K page as a 2K page table */
+	if (!mm_alloc_pgste(mm)) {
+		table = NULL;
+		spin_lock_bh(&mm->context.list_lock);
+		if (!list_empty(&mm->context.pgtable_list)) {
+			page = list_first_entry(&mm->context.pgtable_list,
+						struct page, lru);
+			mask = atomic_read(&page->_mapcount);
+			mask = (mask | (mask >> 4)) & 3;
+			if (mask != 3) {
+				table = (unsigned long *) page_to_phys(page);
+				bit = mask & 1;		/* =1 -> second 2K */
+				if (bit)
+					table += PTRS_PER_PTE;
+				atomic_xor_bits(&page->_mapcount, 1U << bit);
+				list_del(&page->lru);
+			}
+		}
+		spin_unlock_bh(&mm->context.list_lock);
+		if (table)
+			return table;
+	}
+	/* Allocate a fresh page */
+	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+	if (!page)
+		return NULL;
+	if (!pgtable_page_ctor(page)) {
+		__free_page(page);
+		return NULL;
+	}
+	/* Initialize page table */
+	table = (unsigned long *) page_to_phys(page);
+	if (mm_alloc_pgste(mm)) {
+		/* Return 4K page table with PGSTEs */
+		atomic_set(&page->_mapcount, 3);
+		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+	} else {
+		/* Return the first 2K fragment of the page */
+		atomic_set(&page->_mapcount, 1);
+		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
+		spin_lock_bh(&mm->context.list_lock);
+		list_add(&page->lru, &mm->context.pgtable_list);
+		spin_unlock_bh(&mm->context.list_lock);
+	}
+	return table;
+}
+
+void page_table_free(struct mm_struct *mm, unsigned long *table)
+{
+	struct page *page;
+	unsigned int bit, mask;
+
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	if (!mm_alloc_pgste(mm)) {
+		/* Free 2K page table fragment of a 4K page */
+		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
+		spin_lock_bh(&mm->context.list_lock);
+		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
+		if (mask & 3)
+			list_add(&page->lru, &mm->context.pgtable_list);
+		else
+			list_del(&page->lru);
+		spin_unlock_bh(&mm->context.list_lock);
+		if (mask != 0)
+			return;
+	}
+
+	pgtable_page_dtor(page);
+	atomic_set(&page->_mapcount, -1);
+	__free_page(page);
+}
+
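+/*
+ * Deferred free of a page table that may still be in use by a
+ * concurrent lockless page table walker: mark the 2K fragment as
+ * pending (bits 4-5 of _mapcount) and hand it to tlb_remove_table(),
+ * which frees it after an RCU grace period.  The low bits of the
+ * table pointer encode what has to be freed, see __tlb_remove_table().
+ */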
+void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
+			 unsigned long vmaddr)
+{
+	struct mm_struct *mm;
+	struct page *page;
+	unsigned int bit, mask;
+
+	mm = tlb->mm;
+	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+	if (mm_alloc_pgste(mm)) {
+		gmap_unlink(mm, table, vmaddr);
+		table = (unsigned long *) (__pa(table) | 3);
+		tlb_remove_table(tlb, table);
+		return;
+	}
+	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
+	spin_lock_bh(&mm->context.list_lock);
+	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
+	if (mask & 3)
+		list_add_tail(&page->lru, &mm->context.pgtable_list);
+	else
+		list_del(&page->lru);
+	spin_unlock_bh(&mm->context.list_lock);
+	table = (unsigned long *) (__pa(table) | (1U << bit));
+	tlb_remove_table(tlb, table);
+}
+
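+/*
+ * The two low bits of the table pointer passed to __tlb_remove_table()
+ * select the free action: 0 = crst table (order-2 pages), 1/2 = lower
+ * or upper 2K page table fragment, 3 = 4K page table with PGSTEs.
+ */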
+static void __tlb_remove_table(void *_table)
+{
+	unsigned int mask = (unsigned long) _table & 3;
+	void *table = (void *)((unsigned long) _table ^ mask);
+	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+
+	switch (mask) {
+	case 0:		/* pmd or pud */
+		free_pages((unsigned long) table, 2);
+		break;
+	case 1:		/* lower 2K of a 4K page table */
+	case 2:		/* higher 2K of a 4K page table */
+		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
+			break;
+		/* fallthrough */
+	case 3:		/* 4K page table with pgstes */
+		pgtable_page_dtor(page);
+		atomic_set(&page->_mapcount, -1);
+		__free_page(page);
+		break;
+	}
+}
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+	/* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+	/*
+	 * This isn't an RCU grace period and hence the page-tables cannot be
+	 * assumed to be actually RCU-freed.
+	 *
+	 * It is however sufficient for software page-table walkers that rely
+	 * on IRQ disabling. See the comment near struct mmu_table_batch.
+	 */
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	__tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+	struct mmu_table_batch *batch;
+	int i;
+
+	batch = container_of(head, struct mmu_table_batch, rcu);
+
+	for (i = 0; i < batch->nr; i++)
+		__tlb_remove_table(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
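+/*
+ * Table frees are batched: tlb_remove_table() collects the pointers in
+ * a page sized mmu_table_batch that tlb_table_flush() releases after an
+ * RCU-sched grace period.  If no batch page is available, the mm is
+ * flushed and the table is freed directly after synchronizing with all
+ * other CPUs.
+ */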
+void tlb_table_flush(struct mmu_gather *tlb)
+{
+	struct mmu_table_batch **batch = &tlb->batch;
+
+	if (*batch) {
+		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
+		*batch = NULL;
+	}
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+	struct mmu_table_batch **batch = &tlb->batch;
+
+	tlb->mm->context.flush_mm = 1;
+	if (*batch == NULL) {
+		*batch = (struct mmu_table_batch *)
+			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+		if (*batch == NULL) {
+			__tlb_flush_mm_lazy(tlb->mm);
+			tlb_remove_table_one(table);
+			return;
+		}
+		(*batch)->nr = 0;
+	}
+	(*batch)->tables[(*batch)->nr++] = table;
+	if ((*batch)->nr == MAX_TABLE_BATCH)
+		tlb_flush_mmu(tlb);
+}