/*
 * Copyright 2010 Tilera Corporation.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 */
void show_mem(void)
{
	struct zone *zone;

	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
	       " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
	       " pagecache:%lu swap:%lu\n",
	       (global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE)),
	       (global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE)),
	       global_page_state(NR_FILE_DIRTY),
	       global_page_state(NR_WRITEBACK),
	       global_page_state(NR_UNSTABLE_NFS),
	       global_page_state(NR_FREE_PAGES),
	       (global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE)),
	       global_page_state(NR_FILE_MAPPED),
	       global_page_state(NR_PAGETABLE),
	       global_page_state(NR_BOUNCE),
	       global_page_state(NR_FILE_PAGES),
	       nr_swap_pages);

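	/* Per zone: sum the free pages and track the largest free block order. */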
	for_each_zone(zone) {
		unsigned long flags, order, total = 0, largest_order = -1;

		if (!populated_zone(zone))
			continue;

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			int nr = zone->free_area[order].nr_free;
			total += nr << order;
			if (nr)
				largest_order = order;
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		pr_err("Node %d %7s: %lukB (largest %lukB)\n",
		       zone_to_nid(zone), zone->name, K(total),
		       largest_order != (unsigned long)-1 ?
		       K(1UL) << largest_order : 0);
	}
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	/* <pfn,flags> stored as-is, to permit clearing entries */
	set_pte(pte, pfn_pte(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * This appears conservative since it is only called
	 * from __set_fixmap.
	 */
	local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
}

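/* Install a fixmap entry: map the given physical address at its fixed VA. */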
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}

#if defined(CONFIG_HIGHPTE)
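/*
 * kmap_atomic() the (possibly highmem) page holding the L2 page table
 * referenced by "dir" and return a pointer to the PTE for "address".
 */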
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address, enum km_type type)
{
	pte_t *pte = kmap_atomic(pmd_page(*dir), type) +
		((pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK);
	return &pte[pte_index(address)];
}
#endif

/*
 * A list of all pgd's is needed so that entries can be invalidated in
 * both cached and uncached pgd's.  This is essentially codepath-based
 * locking against pageattr.c; it is the unique case in which a valid
 * change of kernel pagetables can't be lazily synchronized by vmalloc
 * faults.  vmalloc faults work because attached pagetables are never
 * freed.  The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

static inline void pgd_list_add(pgd_t *pgd)
{
	list_add(pgd_to_list(pgd), &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	list_del(pgd_to_list(pgd));
}

#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)

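/*
 * Initialize a new pgd: zero the user portion, copy the kernel mappings
 * from swapper_pg_dir, and add it to pgd_list under pgd_lock.
 */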
static void pgd_ctor(pgd_t *pgd)
{
	unsigned long flags;

	memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
	spin_lock_irqsave(&pgd_lock, flags);

#ifndef __tilegx__
	/*
	 * Check that the user interrupt vector has no L2.
	 * It never should for the swapper, and new page tables
	 * should always start with an empty user interrupt vector.
	 */
	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
			swapper_pg_dir + KERNEL_PGD_INDEX_START,
			KERNEL_PGD_PTRS);

	pgd_list_add(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

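/* Remove a pgd from pgd_list; safe to call from interrupt context. */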
static void pgd_dtor(pgd_t *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

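/* Allocate a pgd from pgd_cache and set up its kernel mappings. */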
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
	if (pgd)
		pgd_ctor(pgd);
	return pgd;
}

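/* Unlink a pgd from pgd_list and return it to pgd_cache. */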
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_dtor(pgd);
	kmem_cache_free(pgd_cache, pgd);
}


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)

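/*
 * Allocate the block of pages backing a user L2 page table
 * (order L2_USER_PGTABLE_ORDER), zeroed, and from highmem if
 * CONFIG_HIGHPTE is enabled.
 */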
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
	struct page *p;

#ifdef CONFIG_HIGHPTE
	flags |= __GFP_HIGHMEM;
#endif

	p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
	if (p == NULL)
		return NULL;

	pgtable_page_ctor(p);
	return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
	pgtable_page_dtor(p);
	__free_pages(p, L2_USER_PGTABLE_ORDER);
}

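/*
 * Queue each page of an L2 page table for freeing via the mmu_gather,
 * flushing the gather early if its page array fills up.
 */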
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
		    unsigned long address)
{
	int i;

	pgtable_page_dtor(pte);
	tlb->need_flush = 1;
	if (tlb_fast_mode(tlb)) {
		struct page *pte_pages[L2_USER_PGTABLE_PAGES];
		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
			pte_pages[i] = pte + i;
		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
		return;
	}
	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
		tlb->pages[tlb->nr++] = pte + i;
		if (tlb->nr >= FREE_PTE_NR)
			tlb_flush_mmu(tlb, 0, 0);
	}
}

#ifndef __tilegx__

/*
 * FIXME: needs to be atomic vs hypervisor writes.  For now we make the
 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
	u8 *tmp = (u8 *)ptep;
	u8 second_byte = tmp[1];
	if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
		return 0;
	tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
	return 1;
}

/*
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
	u32 *tmp = (u32 *)ptep;
	tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

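/*
 * Walk the page tables (the given mm's, or the kernel's if mm is NULL)
 * and return a pointer to the PTE mapping "addr", or NULL if none.
 * For a huge page, the returned pointer is actually to the PMD entry.
 */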
pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_addr_invalid(addr))
		return NULL;

	pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_huge_page(*pmd))
		return (pte_t *)pmd;
	if (!pmd_present(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}

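/*
 * Encode the (x,y) coordinates of the given cpu as the caching "lotar"
 * in a PTE that is already marked for remote tile-L3 caching.
 */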
pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
	unsigned int width = smp_width;
	int x = cpu % width;
	int y = cpu / width;
	BUG_ON(y >= smp_height);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	BUG_ON(cpu < 0 || cpu >= NR_CPUS);
	BUG_ON(!cpu_is_valid_lotar(cpu));
	return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
}

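/* Decode the caching lotar of a remotely-cached PTE back to a cpu number. */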
int get_remote_cache_cpu(pgprot_t prot)
{
	HV_LOTAR lotar = hv_pte_get_lotar(prot);
	int x = HV_LOTAR_X(lotar);
	int y = HV_LOTAR_Y(lotar);
	BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
	return x + y * smp_width;
}

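/*
 * Install a PTE after re-homing it to match the underlying page's home
 * cache.  On 32-bit chips the two 32-bit halves are written high word
 * first; see the comment in the !__tilegx__ branch below.
 */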
void set_pte_order(pte_t *ptep, pte_t pte, int order)
{
	unsigned long pfn = pte_pfn(pte);
	struct page *page = pfn_to_page(pfn);

	/* Update the home of a PTE if necessary */
	pte = pte_set_home(pte, page_home(page));

#ifdef __tilegx__
	*ptep = pte;
#else
	/*
	 * When setting a PTE, write the high bits first, then write
	 * the low bits.  This sets the "present" bit only after the
	 * other bits are in place.  If a particular PTE update
	 * involves transitioning from one valid PTE to another, it
	 * may be necessary to call set_pte_order() more than once,
	 * transitioning via a suitable intermediate state.
	 * Note that this sequence also means that if we are transitioning
	 * from any migrating PTE to a non-migrating one, we will not
	 * see a half-updated PTE with the migrating bit off.
	 */
#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
# error Must write the present and migrating bits last
#endif
	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
	barrier();
	((u32 *)ptep)[0] = (u32)(pte_val(pte));
#endif
}

/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
	return mm->context.priority_cached;
}

/*
 * Add a priority mapping to an mm_context and
 * notify the hypervisor if this is the first one.
 */
void start_mm_caching(struct mm_struct *mm)
{
	if (!mm_is_priority_cached(mm)) {
		mm->context.priority_cached = -1U;
		hv_set_caching(-1U);
	}
}

/*
 * Validate and return the priority_cached flag.  We know if it's zero
 * that we don't need to scan, since we immediately set it non-zero
 * when we first consider a MAP_CACHE_PRIORITY mapping.
 *
 * We only _try_ to acquire the mmap_sem semaphore; if we can't acquire it,
 * since we're in an interrupt context (servicing switch_mm) we don't
 * worry about it and don't unset the "priority_cached" field.
 * Presumably we'll come back later and have more luck and clear
 * the value then; for now we'll just keep the cache marked for priority.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
	if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
		struct vm_area_struct *vm;
		for (vm = mm->mmap; vm; vm = vm->vm_next) {
			if (hv_pte_get_cached_priority(vm->vm_page_prot))
				break;
		}
		if (vm == NULL)
			mm->context.priority_cached = 0;
		up_write(&mm->mmap_sem);
	}
	return mm->context.priority_cached;
}

/* Set caching correctly for an mm that we are switching to. */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
	if (!mm_is_priority_cached(next)) {
		/*
		 * If the new mm doesn't use priority caching, just see if we
		 * need the hv_set_caching(), or can assume it's already zero.
		 */
		if (mm_is_priority_cached(prev))
			hv_set_caching(0);
	} else {
		hv_set_caching(update_priority_cached(next));
	}
}

#if CHIP_HAS_MMIO()

/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
			   pgprot_t home)
{
	void *addr;
	struct vm_struct *area;
	unsigned long offset, last_addr;
	pgprot_t pgprot;

	/* Don't allow wraparound or zero size */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr)
		return NULL;

	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
	pgprot = PAGE_KERNEL;
	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

	/*
	 * Mappings have to be page-aligned
	 */
	offset = phys_addr & ~PAGE_MASK;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr+1) - phys_addr;

	/*
	 * Ok, go for it..
	 */
	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
	if (!area)
		return NULL;
	area->phys_addr = phys_addr;
	addr = area->addr;
	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
			       phys_addr, pgprot)) {
		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
		return NULL;
	}
	return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);

/* Map a PCI MMIO bus address into VA space. */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
	panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
	volatile void __iomem *addr = (volatile void __iomem *)
		(PAGE_MASK & (unsigned long __force)addr_in);
#if 1
	vunmap((void * __force)addr);
#else
	/* x86 uses this complicated flow instead of vunmap().  Is
	 * there any particular reason we should do the same? */
	struct vm_struct *p, *o;

	/* Use the vm area unlocked, assuming the caller
	   ensures there isn't another iounmap for the same address
	   in parallel.  Reuse of the virtual address is prevented by
	   leaving it in the global lists until we're done with it.
	   cpa takes care of the direct mappings. */
	read_lock(&vmlist_lock);
	for (p = vmlist; p; p = p->next) {
		if (p->addr == addr)
			break;
	}
	read_unlock(&vmlist_lock);

	if (!p) {
		pr_err("iounmap: bad address %p\n", addr);
		dump_stack();
		return;
	}

	/* Finally remove it */
	o = remove_vm_area((void *)addr);
	BUG_ON(p != o || o == NULL);
	kfree(p);
#endif
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */