Blame - drivers/misc/sgi-gru/grufault.c - kernel/msm-4.9

blob: 3d33015bbf31a18d2a6aca7c0a662788abc34b97 [file] [log] [blame]

Jack Steiner	1425864	2008-07-29 22:33:57 -0700	[diff] [blame]	1	/*
				2	* SN Platform GRU Driver
				3	*
				4	* FAULT HANDLER FOR GRU DETECTED TLB MISSES
				5	*
				6	* This file contains code that handles TLB misses within the GRU.
				7	* These misses are reported either via interrupts or user polling of
				8	* the user CB.
				9	*
				10	* Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
				11	*
				12	* This program is free software; you can redistribute it and/or modify
				13	* it under the terms of the GNU General Public License as published by
				14	* the Free Software Foundation; either version 2 of the License, or
				15	* (at your option) any later version.
				16	*
				17	* This program is distributed in the hope that it will be useful,
				18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				20	* GNU General Public License for more details.
				21	*
				22	* You should have received a copy of the GNU General Public License
				23	* along with this program; if not, write to the Free Software
				24	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
				25	*/
				26
				27	#include <linux/kernel.h>
				28	#include <linux/errno.h>
				29	#include <linux/spinlock.h>
				30	#include <linux/mm.h>
				31	#include <linux/hugetlb.h>
				32	#include <linux/device.h>
				33	#include <linux/io.h>
				34	#include <linux/uaccess.h>
				35	#include <asm/pgtable.h>
				36	#include "gru.h"
				37	#include "grutables.h"
				38	#include "grulib.h"
				39	#include "gru_instructions.h"
				40	#include <asm/uv/uv_hub.h>
				41
				42	/*
				43	* Test if a physical address is a valid GRU GSEG address
				44	*/
				45	static inline int is_gru_paddr(unsigned long paddr)
				46	{
				47	return paddr >= gru_start_paddr && paddr < gru_end_paddr;
				48	}
				49
				50	/*
				51	* Find the vma of a GRU segment. Caller must hold mmap_sem.
				52	*/
				53	struct vm_area_struct *gru_find_vma(unsigned long vaddr)
				54	{
				55	struct vm_area_struct *vma;
				56
				57	vma = find_vma(current->mm, vaddr);
				58	if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops)
				59	return vma;
				60	return NULL;
				61	}
				62
				63	/*
				64	* Find and lock the gts that contains the specified user vaddr.
				65	*
				66	* Returns:
				67	* - *gts with the mmap_sem locked for read and the GTS locked.
				68	* - NULL if vaddr invalid OR is not a valid GSEG vaddr.
				69	*/
				70
				71	static struct gru_thread_state *gru_find_lock_gts(unsigned long vaddr)
				72	{
				73	struct mm_struct *mm = current->mm;
				74	struct vm_area_struct *vma;
				75	struct gru_thread_state *gts = NULL;
				76
				77	down_read(&mm->mmap_sem);
				78	vma = gru_find_vma(vaddr);
				79	if (vma)
				80	gts = gru_find_thread_state(vma, TSID(vaddr, vma));
				81	if (gts)
				82	mutex_lock(&gts->ts_ctxlock);
				83	else
				84	up_read(&mm->mmap_sem);
				85	return gts;
				86	}
				87
				88	static struct gru_thread_state *gru_alloc_locked_gts(unsigned long vaddr)
				89	{
				90	struct mm_struct *mm = current->mm;
				91	struct vm_area_struct *vma;
				92	struct gru_thread_state *gts = NULL;
				93
				94	down_write(&mm->mmap_sem);
				95	vma = gru_find_vma(vaddr);
				96	if (vma)
				97	gts = gru_alloc_thread_state(vma, TSID(vaddr, vma));
				98	if (gts) {
				99	mutex_lock(&gts->ts_ctxlock);
				100	downgrade_write(&mm->mmap_sem);
				101	} else {
				102	up_write(&mm->mmap_sem);
				103	}
				104
				105	return gts;
				106	}
				107
				108	/*
				109	* Unlock a GTS that was previously locked with gru_find_lock_gts().
				110	*/
				111	static void gru_unlock_gts(struct gru_thread_state *gts)
				112	{
				113	mutex_unlock(&gts->ts_ctxlock);
				114	up_read(&current->mm->mmap_sem);
				115	}
				116
				117	/*
				118	* Set a CB.istatus to active using a user virtual address. This must be done
				119	* just prior to a TFH RESTART. The new cb.istatus is an in-cache status ONLY.
				120	* If the line is evicted, the status may be lost. The in-cache update
				121	* is necessary to prevent the user from seeing a stale cb.istatus that will
				122	* change as soon as the TFH restart is complete. Races may cause an
				123	* occasional failure to clear the cb.istatus, but that is ok.
				124	*
				125	* If the cb address is not valid (should not happen, but...), nothing
				126	* bad will happen.. The get_user()/put_user() will fail but there
				127	* are no bad side-effects.
				128	*/
				129	static void gru_cb_set_istatus_active(unsigned long __user *cb)
				130	{
				131	union {
				132	struct gru_instruction_bits bits;
				133	unsigned long dw;
				134	} u;
				135
				136	if (cb) {
				137	get_user(u.dw, cb);
				138	u.bits.istatus = CBS_ACTIVE;
				139	put_user(u.dw, cb);
				140	}
				141	}
				142
				143	/*
				144	* Convert a interrupt IRQ to a pointer to the GRU GTS that caused the
				145	* interrupt. Interrupts are always sent to a cpu on the blade that contains the
				146	* GRU (except for headless blades which are not currently supported). A blade
				147	* has N grus; a block of N consecutive IRQs is assigned to the GRUs. The IRQ
				148	* number uniquely identifies the GRU chiplet on the local blade that caused the
				149	* interrupt. Always called in interrupt context.
				150	*/
				151	static inline struct gru_state *irq_to_gru(int irq)
				152	{
				153	return &gru_base[uv_numa_blade_id()]->bs_grus[irq - IRQ_GRU];
				154	}
				155
				156	/*
				157	* Read & clear a TFM
				158	*
				159	* The GRU has an array of fault maps. A map is private to a cpu
				160	* Only one cpu will be accessing a cpu's fault map.
				161	*
				162	* This function scans the cpu-private fault map & clears all bits that
				163	* are set. The function returns a bitmap that indicates the bits that
				164	* were cleared. Note that sense the maps may be updated asynchronously by
				165	* the GRU, atomic operations must be used to clear bits.
				166	*/
				167	static void get_clear_fault_map(struct gru_state *gru,
				168	struct gru_tlb_fault_map *map)
				169	{
				170	unsigned long i, k;
				171	struct gru_tlb_fault_map *tfm;
				172
				173	tfm = get_tfm_for_cpu(gru, gru_cpu_fault_map_id());
				174	prefetchw(tfm); /* Helps on hardware, required for emulator */
				175	for (i = 0; i < BITS_TO_LONGS(GRU_NUM_CBE); i++) {
				176	k = tfm->fault_bits[i];
				177	if (k)
				178	k = xchg(&tfm->fault_bits[i], 0UL);
				179	map->fault_bits[i] = k;
				180	}
				181
				182	/*
				183	* Not functionally required but helps performance. (Required
				184	* on emulator)
				185	*/
				186	gru_flush_cache(tfm);
				187	}
				188
				189	/*
				190	* Atomic (interrupt context) & non-atomic (user context) functions to
				191	* convert a vaddr into a physical address. The size of the page
				192	* is returned in pageshift.
				193	* returns:
				194	* 0 - successful
				195	* < 0 - error code
				196	* 1 - (atomic only) try again in non-atomic context
				197	*/
				198	static int non_atomic_pte_lookup(struct vm_area_struct *vma,
				199	unsigned long vaddr, int write,
				200	unsigned long paddr, int pageshift)
				201	{
				202	struct page *page;
				203
				204	/* ZZZ Need to handle HUGE pages */
				205	if (is_vm_hugetlb_page(vma))
				206	return -EFAULT;
				207	*pageshift = PAGE_SHIFT;
				208	if (get_user_pages
				209	(current, current->mm, vaddr, 1, write, 0, &page, NULL) <= 0)
				210	return -EFAULT;
				211	*paddr = page_to_phys(page);
				212	put_page(page);
				213	return 0;
				214	}
				215
				216	/*
				217	*
				218	* atomic_pte_lookup
				219	*
				220	* Convert a user virtual address to a physical address
				221	* Only supports Intel large pages (2MB only) on x86_64.
				222	* ZZZ - hugepage support is incomplete
				223	*/
				224	static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
				225	int write, unsigned long paddr, int pageshift)
				226	{
				227	pgd_t *pgdp;
				228	pmd_t *pmdp;
				229	pud_t *pudp;
				230	pte_t pte;
				231
				232	WARN_ON(irqs_disabled()); /* ZZZ debug */
				233
				234	local_irq_disable();
				235	pgdp = pgd_offset(vma->vm_mm, vaddr);
				236	if (unlikely(pgd_none(*pgdp)))
				237	goto err;
				238
				239	pudp = pud_offset(pgdp, vaddr);
				240	if (unlikely(pud_none(*pudp)))
				241	goto err;
				242
				243	pmdp = pmd_offset(pudp, vaddr);
				244	if (unlikely(pmd_none(*pmdp)))
				245	goto err;
				246	#ifdef CONFIG_X86_64
				247	if (unlikely(pmd_large(*pmdp)))
				248	pte = (pte_t ) pmdp;
				249	else
				250	#endif
				251	pte = *pte_offset_kernel(pmdp, vaddr);
				252
				253	local_irq_enable();
				254
				255	if (unlikely(!pte_present(pte) \|\|
				256	(write && (!pte_write(pte) \|\| !pte_dirty(pte)))))
				257	return 1;
				258
				259	*paddr = pte_pfn(pte) << PAGE_SHIFT;
				260	*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
				261	return 0;
				262
				263	err:
				264	local_irq_enable();
				265	return 1;
				266	}
				267
				268	/*
				269	* Drop a TLB entry into the GRU. The fault is described by info in an TFH.
				270	* Input:
				271	* cb Address of user CBR. Null if not running in user context
				272	* Return:
				273	* 0 = dropin, exception, or switch to UPM successful
				274	* 1 = range invalidate active
				275	* < 0 = error code
				276	*
				277	*/
				278	static int gru_try_dropin(struct gru_thread_state *gts,
				279	struct gru_tlb_fault_handle *tfh,
				280	unsigned long __user *cb)
				281	{
				282	struct mm_struct *mm = gts->ts_mm;
				283	struct vm_area_struct *vma;
				284	int pageshift, asid, write, ret;
				285	unsigned long paddr, gpa, vaddr;
				286
				287	/*
				288	* NOTE: The GRU contains magic hardware that eliminates races between
				289	* TLB invalidates and TLB dropins. If an invalidate occurs
				290	* in the window between reading the TFH and the subsequent TLB dropin,
				291	* the dropin is ignored. This eliminates the need for additional locks.
				292	*/
				293
				294	/*
				295	* Error if TFH state is IDLE or FMM mode & the user issuing a UPM call.
				296	* Might be a hardware race OR a stupid user. Ignore FMM because FMM
				297	* is a transient state.
				298	*/
				299	if (tfh->state == TFHSTATE_IDLE)
				300	goto failidle;
				301	if (tfh->state == TFHSTATE_MISS_FMM && cb)
				302	goto failfmm;
				303
				304	write = (tfh->cause & TFHCAUSE_TLB_MOD) != 0;
				305	vaddr = tfh->missvaddr;
				306	asid = tfh->missasid;
				307	if (asid == 0)
				308	goto failnoasid;
				309
				310	rmb(); /* TFH must be cache resident before reading ms_range_active */
				311
				312	/*
				313	* TFH is cache resident - at least briefly. Fail the dropin
				314	* if a range invalidate is active.
				315	*/
				316	if (atomic_read(&gts->ts_gms->ms_range_active))
				317	goto failactive;
				318
				319	vma = find_vma(mm, vaddr);
				320	if (!vma)
				321	goto failinval;
				322
				323	/*
				324	* Atomic lookup is faster & usually works even if called in non-atomic
				325	* context.
				326	*/
				327	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &pageshift);
				328	if (ret) {
				329	if (!cb)
				330	goto failupm;
				331	if (non_atomic_pte_lookup(vma, vaddr, write, &paddr,
				332	&pageshift))
				333	goto failinval;
				334	}
				335	if (is_gru_paddr(paddr))
				336	goto failinval;
				337
				338	paddr = paddr & ~((1UL << pageshift) - 1);
				339	gpa = uv_soc_phys_ram_to_gpa(paddr);
				340	gru_cb_set_istatus_active(cb);
				341	tfh_write_restart(tfh, gpa, GAA_RAM, vaddr, asid, write,
				342	GRU_PAGESIZE(pageshift));
				343	STAT(tlb_dropin);
				344	gru_dbg(grudev,
				345	"%s: tfh 0x%p, vaddr 0x%lx, asid 0x%x, ps %d, gpa 0x%lx\n",
				346	ret ? "non-atomic" : "atomic", tfh, vaddr, asid,
				347	pageshift, gpa);
				348	return 0;
				349
				350	failnoasid:
				351	/* No asid (delayed unload). */
				352	STAT(tlb_dropin_fail_no_asid);
				353	gru_dbg(grudev, "FAILED no_asid tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
				354	if (!cb)
				355	tfh_user_polling_mode(tfh);
				356	else
				357	gru_flush_cache(tfh);
				358	return -EAGAIN;
				359
				360	failupm:
				361	/* Atomic failure switch CBR to UPM */
				362	tfh_user_polling_mode(tfh);
				363	STAT(tlb_dropin_fail_upm);
				364	gru_dbg(grudev, "FAILED upm tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
				365	return 1;
				366
				367	failfmm:
				368	/* FMM state on UPM call */
				369	STAT(tlb_dropin_fail_fmm);
				370	gru_dbg(grudev, "FAILED fmm tfh: 0x%p, state %d\n", tfh, tfh->state);
				371	return 0;
				372
				373	failidle:
				374	/* TFH was idle - no miss pending */
				375	gru_flush_cache(tfh);
				376	if (cb)
				377	gru_flush_cache(cb);
				378	STAT(tlb_dropin_fail_idle);
				379	gru_dbg(grudev, "FAILED idle tfh: 0x%p, state %d\n", tfh, tfh->state);
				380	return 0;
				381
				382	failinval:
				383	/* All errors (atomic & non-atomic) switch CBR to EXCEPTION state */
				384	tfh_exception(tfh);
				385	STAT(tlb_dropin_fail_invalid);
				386	gru_dbg(grudev, "FAILED inval tfh: 0x%p, vaddr 0x%lx\n", tfh, vaddr);
				387	return -EFAULT;
				388
				389	failactive:
				390	/* Range invalidate active. Switch to UPM iff atomic */
				391	if (!cb)
				392	tfh_user_polling_mode(tfh);
				393	else
				394	gru_flush_cache(tfh);
				395	STAT(tlb_dropin_fail_range_active);
				396	gru_dbg(grudev, "FAILED range active: tfh 0x%p, vaddr 0x%lx\n",
				397	tfh, vaddr);
				398	return 1;
				399	}
				400
				401	/*
				402	* Process an external interrupt from the GRU. This interrupt is
				403	* caused by a TLB miss.
				404	* Note that this is the interrupt handler that is registered with linux
				405	* interrupt handlers.
				406	*/
				407	irqreturn_t gru_intr(int irq, void *dev_id)
				408	{
				409	struct gru_state *gru;
				410	struct gru_tlb_fault_map map;
				411	struct gru_thread_state *gts;
				412	struct gru_tlb_fault_handle *tfh = NULL;
				413	int cbrnum, ctxnum;
				414
				415	STAT(intr);
				416
				417	gru = irq_to_gru(irq);
				418	if (!gru) {
				419	dev_err(grudev, "GRU: invalid interrupt: cpu %d, irq %d\n",
				420	raw_smp_processor_id(), irq);
				421	return IRQ_NONE;
				422	}
				423	get_clear_fault_map(gru, &map);
				424	gru_dbg(grudev, "irq %d, gru %x, map 0x%lx\n", irq, gru->gs_gid,
				425	map.fault_bits[0]);
				426
				427	for_each_cbr_in_tfm(cbrnum, map.fault_bits) {
				428	tfh = get_tfh_by_index(gru, cbrnum);
				429	prefetchw(tfh); /* Helps on hdw, required for emulator */
				430
				431	/*
				432	* When hardware sets a bit in the faultmap, it implicitly
				433	* locks the GRU context so that it cannot be unloaded.
				434	* The gts cannot change until a TFH start/writestart command
				435	* is issued.
				436	*/
				437	ctxnum = tfh->ctxnum;
				438	gts = gru->gs_gts[ctxnum];
				439
				440	/*
				441	* This is running in interrupt context. Trylock the mmap_sem.
				442	* If it fails, retry the fault in user context.
				443	*/
				444	if (down_read_trylock(&gts->ts_mm->mmap_sem)) {
				445	gru_try_dropin(gts, tfh, NULL);
				446	up_read(&gts->ts_mm->mmap_sem);
				447	} else {
				448	tfh_user_polling_mode(tfh);
				449	}
				450	}
				451	return IRQ_HANDLED;
				452	}
				453
				454
				455	static int gru_user_dropin(struct gru_thread_state *gts,
				456	struct gru_tlb_fault_handle *tfh,
				457	unsigned long __user *cb)
				458	{
				459	struct gru_mm_struct *gms = gts->ts_gms;
				460	int ret;
				461
				462	while (1) {
				463	wait_event(gms->ms_wait_queue,
				464	atomic_read(&gms->ms_range_active) == 0);
				465	prefetchw(tfh); /* Helps on hdw, required for emulator */
				466	ret = gru_try_dropin(gts, tfh, cb);
				467	if (ret <= 0)
				468	return ret;
				469	STAT(call_os_wait_queue);
				470	}
				471	}
				472
				473	/*
				474	* This interface is called as a result of a user detecting a "call OS" bit
				475	* in a user CB. Normally means that a TLB fault has occurred.
				476	* cb - user virtual address of the CB
				477	*/
				478	int gru_handle_user_call_os(unsigned long cb)
				479	{
				480	struct gru_tlb_fault_handle *tfh;
				481	struct gru_thread_state *gts;
				482	unsigned long __user *cbp;
				483	int ucbnum, cbrnum, ret = -EINVAL;
				484
				485	STAT(call_os);
				486	gru_dbg(grudev, "address 0x%lx\n", cb);
				487
				488	/* sanity check the cb pointer */
				489	ucbnum = get_cb_number((void *)cb);
				490	if ((cb & (GRU_HANDLE_STRIDE - 1)) \|\| ucbnum >= GRU_NUM_CB)
				491	return -EINVAL;
				492	cbp = (unsigned long *)cb;
				493
				494	gts = gru_find_lock_gts(cb);
				495	if (!gts)
				496	return -EINVAL;
				497
				498	if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) {
				499	ret = -EINVAL;
				500	goto exit;
				501	}
				502
				503	/*
				504	* If force_unload is set, the UPM TLB fault is phony. The task
				505	* has migrated to another node and the GSEG must be moved. Just
				506	* unload the context. The task will page fault and assign a new
				507	* context.
				508	*/
				509	ret = -EAGAIN;
				510	cbrnum = thread_cbr_number(gts, ucbnum);
				511	if (gts->ts_force_unload) {
				512	gru_unload_context(gts, 1);
				513	} else if (gts->ts_gru) {
				514	tfh = get_tfh_by_index(gts->ts_gru, cbrnum);
				515	ret = gru_user_dropin(gts, tfh, cbp);
				516	}
				517	exit:
				518	gru_unlock_gts(gts);
				519	return ret;
				520	}
				521
				522	/*
				523	* Fetch the exception detail information for a CB that terminated with
				524	* an exception.
				525	*/
				526	int gru_get_exception_detail(unsigned long arg)
				527	{
				528	struct control_block_extended_exc_detail excdet;
				529	struct gru_control_block_extended *cbe;
				530	struct gru_thread_state *gts;
				531	int ucbnum, cbrnum, ret;
				532
				533	STAT(user_exception);
				534	if (copy_from_user(&excdet, (void __user *)arg, sizeof(excdet)))
				535	return -EFAULT;
				536
				537	gru_dbg(grudev, "address 0x%lx\n", excdet.cb);
				538	gts = gru_find_lock_gts(excdet.cb);
				539	if (!gts)
				540	return -EINVAL;
				541
				542	if (gts->ts_gru) {
				543	ucbnum = get_cb_number((void *)excdet.cb);
				544	cbrnum = thread_cbr_number(gts, ucbnum);
				545	cbe = get_cbe_by_index(gts->ts_gru, cbrnum);
				546	excdet.opc = cbe->opccpy;
				547	excdet.exopc = cbe->exopccpy;
				548	excdet.ecause = cbe->ecause;
				549	excdet.exceptdet0 = cbe->idef1upd;
				550	excdet.exceptdet1 = cbe->idef3upd;
				551	ret = 0;
				552	} else {
				553	ret = -EAGAIN;
				554	}
				555	gru_unlock_gts(gts);
				556
				557	gru_dbg(grudev, "address 0x%lx, ecause 0x%x\n", excdet.cb,
				558	excdet.ecause);
				559	if (!ret && copy_to_user((void __user *)arg, &excdet, sizeof(excdet)))
				560	ret = -EFAULT;
				561	return ret;
				562	}
				563
				564	/*
				565	* User request to unload a context. Content is saved for possible reload.
				566	*/
				567	int gru_user_unload_context(unsigned long arg)
				568	{
				569	struct gru_thread_state *gts;
				570	struct gru_unload_context_req req;
				571
				572	STAT(user_unload_context);
				573	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
				574	return -EFAULT;
				575
				576	gru_dbg(grudev, "gseg 0x%lx\n", req.gseg);
				577
				578	gts = gru_find_lock_gts(req.gseg);
				579	if (!gts)
				580	return -EINVAL;
				581
				582	if (gts->ts_gru)
				583	gru_unload_context(gts, 1);
				584	gru_unlock_gts(gts);
				585
				586	return 0;
				587	}
				588
				589	/*
				590	* User request to flush a range of virtual addresses from the GRU TLB
				591	* (Mainly for testing).
				592	*/
				593	int gru_user_flush_tlb(unsigned long arg)
				594	{
				595	struct gru_thread_state *gts;
				596	struct gru_flush_tlb_req req;
				597
				598	STAT(user_flush_tlb);
				599	if (copy_from_user(&req, (void __user *)arg, sizeof(req)))
				600	return -EFAULT;
				601
				602	gru_dbg(grudev, "gseg 0x%lx, vaddr 0x%lx, len 0x%lx\n", req.gseg,
				603	req.vaddr, req.len);
				604
				605	gts = gru_find_lock_gts(req.gseg);
				606	if (!gts)
				607	return -EINVAL;
				608
				609	gru_flush_tlb_range(gts->ts_gms, req.vaddr, req.vaddr + req.len);
				610	gru_unlock_gts(gts);
				611
				612	return 0;
				613	}
				614
				615	/*
				616	* Register the current task as the user of the GSEG slice.
				617	* Needed for TLB fault interrupt targeting.
				618	*/
				619	int gru_set_task_slice(long address)
				620	{
				621	struct gru_thread_state *gts;
				622
				623	STAT(set_task_slice);
				624	gru_dbg(grudev, "address 0x%lx\n", address);
				625	gts = gru_alloc_locked_gts(address);
				626	if (!gts)
				627	return -EINVAL;
				628
				629	gts->ts_tgid_owner = current->tgid;
				630	gru_unlock_gts(gts);
				631
				632	return 0;
				633	}