Blame - drivers/staging/rdma/hfi1/user_exp_rcv.c - kernel/msm-4.9

blob: 79612a2bd07d3d7afc5656636f4bb740e22c74d3 [file] [log] [blame]

Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	1	/*
				2	*
				3	* This file is provided under a dual BSD/GPLv2 license. When using or
				4	* redistributing this file, you may do so under either license.
				5	*
				6	* GPL LICENSE SUMMARY
				7	*
				8	* Copyright(c) 2015 Intel Corporation.
				9	*
				10	* This program is free software; you can redistribute it and/or modify
				11	* it under the terms of version 2 of the GNU General Public License as
				12	* published by the Free Software Foundation.
				13	*
				14	* This program is distributed in the hope that it will be useful, but
				15	* WITHOUT ANY WARRANTY; without even the implied warranty of
				16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				17	* General Public License for more details.
				18	*
				19	* BSD LICENSE
				20	*
				21	* Copyright(c) 2015 Intel Corporation.
				22	*
				23	* Redistribution and use in source and binary forms, with or without
				24	* modification, are permitted provided that the following conditions
				25	* are met:
				26	*
				27	* - Redistributions of source code must retain the above copyright
				28	* notice, this list of conditions and the following disclaimer.
				29	* - Redistributions in binary form must reproduce the above copyright
				30	* notice, this list of conditions and the following disclaimer in
				31	* the documentation and/or other materials provided with the
				32	* distribution.
				33	* - Neither the name of Intel Corporation nor the names of its
				34	* contributors may be used to endorse or promote products derived
				35	* from this software without specific prior written permission.
				36	*
				37	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				38	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				39	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				40	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				41	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				42	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				43	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				44	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				45	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				46	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				47	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				48	*
				49	*/
				50	#include <asm/page.h>
				51
				52	#include "user_exp_rcv.h"
				53	#include "trace.h"
				54
Mitko Haralanov	b8abe34	2016-02-05 11:57:51 -0500	[diff] [blame]	55	struct tid_group {
				56	struct list_head list;
				57	unsigned base;
				58	u8 size;
				59	u8 used;
				60	u8 map;
				61	};
				62
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	63	struct mmu_rb_node {
				64	struct rb_node rbnode;
				65	unsigned long virt;
				66	unsigned long phys;
				67	unsigned long len;
				68	struct tid_group *grp;
				69	u32 rcventry;
				70	dma_addr_t dma_addr;
				71	bool freed;
				72	unsigned npages;
				73	struct page *pages[0];
				74	};
				75
				76	enum mmu_call_types {
				77	MMU_INVALIDATE_PAGE = 0,
				78	MMU_INVALIDATE_RANGE = 1
				79	};
				80
				81	static const char * const mmu_types[] = {
				82	"PAGE",
				83	"RANGE"
				84	};
				85
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	86	struct tid_pageset {
				87	u16 idx;
				88	u16 count;
				89	};
				90
/*
 * True when the given struct exp_tid_set (passed by value expression, not
 * by pointer) has a zero count and an empty list. The argument is
 * parenthesized so that expressions such as *ptr expand correctly; note
 * that it is still evaluated twice.
 */
#define EXP_TID_SET_EMPTY(set) \
	((set).count == 0 && list_empty(&(set).list))
				92
/*
 * Number of pages spanned by the user buffer [vaddr, vaddr + len).
 * Computed as 1 + (page of last byte - page of first byte). Both
 * arguments are fully parenthesized; 'vaddr' is evaluated twice, so it
 * must not have side effects.
 */
#define num_user_pages(vaddr, len)					\
	(1 + (((((unsigned long)(vaddr) +				\
		 (unsigned long)(len) - 1) & PAGE_MASK) -		\
	       ((unsigned long)(vaddr) & PAGE_MASK)) >> PAGE_SHIFT))
				97
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	98	static void unlock_exp_tids(struct hfi1_ctxtdata , struct exp_tid_set ,
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	99	struct rb_root *);
Mitko Haralanov	7e7a436e	2016-02-05 11:57:57 -0500	[diff] [blame]	100	static u32 find_phys_blocks(struct page *, unsigned, struct tid_pageset );
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	101	static int set_rcvarray_entry(struct file *, unsigned long, u32,
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	102	struct tid_group , struct page *, unsigned);
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	103	static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
				104	unsigned long);
				105	static struct mmu_rb_node mmu_rb_search_by_addr(struct rb_root ,
Mitko Haralanov	b5eb3b2	2016-02-05 11:57:55 -0500	[diff] [blame]	106	unsigned long);
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	107	static inline struct mmu_rb_node mmu_rb_search_by_entry(struct rb_root ,
				108	u32);
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	109	static int mmu_rb_insert_by_addr(struct rb_root , struct mmu_rb_node );
				110	static int mmu_rb_insert_by_entry(struct rb_root , struct mmu_rb_node );
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	111	static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
				112	unsigned long, unsigned long,
				113	enum mmu_call_types);
				114	static inline void mmu_notifier_page(struct mmu_notifier , struct mm_struct ,
				115	unsigned long);
				116	static inline void mmu_notifier_range_start(struct mmu_notifier *,
				117	struct mm_struct *,
				118	unsigned long, unsigned long);
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	119	static int program_rcvarray(struct file , unsigned long, struct tid_group ,
				120	struct tid_pageset , unsigned, u16, struct page *,
Mitko Haralanov	7e7a436e	2016-02-05 11:57:57 -0500	[diff] [blame]	121	u32 , unsigned , unsigned *);
Mitko Haralanov	455d7f1	2016-02-05 11:57:56 -0500	[diff] [blame]	122	static int unprogram_rcvarray(struct file , u32, struct tid_group *);
				123	static void clear_tid_node(struct hfi1_filedata , u16, struct mmu_rb_node );
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	124
				125	static inline u32 rcventry2tidinfo(u32 rcventry)
				126	{
				127	u32 pair = rcventry & ~0x1;
				128
				129	return EXP_TID_SET(IDX, pair >> 1) \|
				130	EXP_TID_SET(CTRL, 1 << (rcventry - pair));
				131	}
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	132
Mitko Haralanov	b8abe34	2016-02-05 11:57:51 -0500	[diff] [blame]	133	static inline void exp_tid_group_init(struct exp_tid_set *set)
				134	{
				135	INIT_LIST_HEAD(&set->list);
				136	set->count = 0;
				137	}
				138
				139	static inline void tid_group_remove(struct tid_group *grp,
				140	struct exp_tid_set *set)
				141	{
				142	list_del_init(&grp->list);
				143	set->count--;
				144	}
				145
				146	static inline void tid_group_add_tail(struct tid_group *grp,
				147	struct exp_tid_set *set)
				148	{
				149	list_add_tail(&grp->list, &set->list);
				150	set->count++;
				151	}
				152
				153	static inline struct tid_group tid_group_pop(struct exp_tid_set set)
				154	{
				155	struct tid_group *grp =
				156	list_first_entry(&set->list, struct tid_group, list);
				157	list_del_init(&grp->list);
				158	set->count--;
				159	return grp;
				160	}
				161
				162	static inline void tid_group_move(struct tid_group *group,
				163	struct exp_tid_set *s1,
				164	struct exp_tid_set *s2)
				165	{
				166	tid_group_remove(group, s1);
				167	tid_group_add_tail(group, s2);
				168	}
				169
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	170	static struct mmu_notifier_ops mn_opts = {
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	171	.invalidate_page = mmu_notifier_page,
				172	.invalidate_range_start = mmu_notifier_range_start,
				173	};
				174
				175	/*
				176	* Initialize context and file private data needed for Expected
				177	* receive caching. This needs to be done after the context has
				178	* been configured with the eager/expected RcvEntry counts.
				179	*/
				180	int hfi1_user_exp_rcv_init(struct file *fp)
				181	{
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	182	struct hfi1_filedata *fd = fp->private_data;
				183	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				184	struct hfi1_devdata *dd = uctxt->dd;
				185	unsigned tidbase;
				186	int i, ret = 0;
				187
				188	INIT_HLIST_NODE(&fd->mn.hlist);
				189	spin_lock_init(&fd->rb_lock);
				190	spin_lock_init(&fd->tid_lock);
				191	spin_lock_init(&fd->invalid_lock);
				192	fd->mn.ops = &mn_opts;
				193	fd->tid_rb_root = RB_ROOT;
				194
				195	if (!uctxt->subctxt_cnt \|\| !fd->subctxt) {
				196	exp_tid_group_init(&uctxt->tid_group_list);
				197	exp_tid_group_init(&uctxt->tid_used_list);
				198	exp_tid_group_init(&uctxt->tid_full_list);
				199
				200	tidbase = uctxt->expected_base;
				201	for (i = 0; i < uctxt->expected_count /
				202	dd->rcv_entries.group_size; i++) {
				203	struct tid_group *grp;
				204
				205	grp = kzalloc(sizeof(*grp), GFP_KERNEL);
				206	if (!grp) {
				207	/*
				208	* If we fail here, the groups already
				209	* allocated will be freed by the close
				210	* call.
				211	*/
				212	ret = -ENOMEM;
				213	goto done;
				214	}
				215	grp->size = dd->rcv_entries.group_size;
				216	grp->base = tidbase;
				217	tid_group_add_tail(grp, &uctxt->tid_group_list);
				218	tidbase += dd->rcv_entries.group_size;
				219	}
				220	}
				221
				222	if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
				223	fd->invalid_tid_idx = 0;
				224	fd->invalid_tids = kzalloc(uctxt->expected_count *
				225	sizeof(u32), GFP_KERNEL);
				226	if (!fd->invalid_tids) {
				227	ret = -ENOMEM;
				228	goto done;
				229	} else {
				230	/*
				231	* Register MMU notifier callbacks. If the registration
				232	* fails, continue but turn off the TID caching for
				233	* all user contexts.
				234	*/
				235	ret = mmu_notifier_register(&fd->mn, current->mm);
				236	if (ret) {
				237	dd_dev_info(dd,
				238	"Failed MMU notifier registration %d\n",
				239	ret);
				240	HFI1_CAP_USET(TID_UNMAP);
				241	ret = 0;
				242	}
				243	}
				244	}
				245
				246	if (HFI1_CAP_IS_USET(TID_UNMAP))
				247	fd->mmu_rb_insert = mmu_rb_insert_by_entry;
				248	else
				249	fd->mmu_rb_insert = mmu_rb_insert_by_addr;
				250
				251	/*
				252	* PSM does not have a good way to separate, count, and
				253	* effectively enforce a limit on RcvArray entries used by
				254	* subctxts (when context sharing is used) when TID caching
				255	* is enabled. To help with that, we calculate a per-process
				256	* RcvArray entry share and enforce that.
				257	* If TID caching is not in use, PSM deals with usage on its
				258	* own. In that case, we allow any subctxt to take all of the
				259	* entries.
				260	*
				261	* Make sure that we set the tid counts only after successful
				262	* init.
				263	*/
Mitko Haralanov	455d7f1	2016-02-05 11:57:56 -0500	[diff] [blame]	264	spin_lock(&fd->tid_lock);
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	265	if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
				266	u16 remainder;
				267
				268	fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
				269	remainder = uctxt->expected_count % uctxt->subctxt_cnt;
				270	if (remainder && fd->subctxt < remainder)
				271	fd->tid_limit++;
				272	} else {
				273	fd->tid_limit = uctxt->expected_count;
				274	}
Mitko Haralanov	455d7f1	2016-02-05 11:57:56 -0500	[diff] [blame]	275	spin_unlock(&fd->tid_lock);
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	276	done:
				277	return ret;
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	278	}
				279
				280	int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
				281	{
Mitko Haralanov	3abb33a	2016-02-05 11:57:54 -0500	[diff] [blame]	282	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				283	struct tid_group grp, gptr;
				284
				285	/*
				286	* The notifier would have been removed when the process'es mm
				287	* was freed.
				288	*/
				289	if (current->mm && !HFI1_CAP_IS_USET(TID_UNMAP))
				290	mmu_notifier_unregister(&fd->mn, current->mm);
				291
				292	kfree(fd->invalid_tids);
				293
				294	if (!uctxt->cnt) {
				295	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
				296	unlock_exp_tids(uctxt, &uctxt->tid_full_list,
				297	&fd->tid_rb_root);
				298	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
				299	unlock_exp_tids(uctxt, &uctxt->tid_used_list,
				300	&fd->tid_rb_root);
				301	list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
				302	list) {
				303	list_del_init(&grp->list);
				304	kfree(grp);
				305	}
				306	spin_lock(&fd->rb_lock);
				307	if (!RB_EMPTY_ROOT(&fd->tid_rb_root)) {
				308	struct rb_node *node;
				309	struct mmu_rb_node *rbnode;
				310
				311	while ((node = rb_first(&fd->tid_rb_root))) {
				312	rbnode = rb_entry(node, struct mmu_rb_node,
				313	rbnode);
				314	rb_erase(&rbnode->rbnode, &fd->tid_rb_root);
				315	kfree(rbnode);
				316	}
				317	}
				318	spin_unlock(&fd->rb_lock);
				319	hfi1_clear_tids(uctxt);
				320	}
				321	return 0;
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	322	}
				323
Mitko Haralanov	b8abe34	2016-02-05 11:57:51 -0500	[diff] [blame]	324	/*
				325	* Write an "empty" RcvArray entry.
				326	* This function exists so the TID registaration code can use it
				327	* to write to unused/unneeded entries and still take advantage
				328	* of the WC performance improvements. The HFI will ignore this
				329	* write to the RcvArray entry.
				330	*/
				331	static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
				332	{
				333	/*
				334	* Doing the WC fill writes only makes sense if the device is
				335	* present and the RcvArray has been mapped as WC memory.
				336	*/
				337	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
				338	writeq(0, dd->rcvarray_wc + (index * 8));
				339	}
				340
Mitko Haralanov	7e7a436e	2016-02-05 11:57:57 -0500	[diff] [blame]	341	/*
				342	* RcvArray entry allocation for Expected Receives is done by the
				343	* following algorithm:
				344	*
				345	* The context keeps 3 lists of groups of RcvArray entries:
				346	* 1. List of empty groups - tid_group_list
				347	* This list is created during user context creation and
				348	* contains elements which describe sets (of 8) of empty
				349	* RcvArray entries.
				350	* 2. List of partially used groups - tid_used_list
				351	* This list contains sets of RcvArray entries which are
				352	* not completely used up. Another mapping request could
				353	* use some of all of the remaining entries.
				354	* 3. List of full groups - tid_full_list
				355	* This is the list where sets that are completely used
				356	* up go.
				357	*
				358	* An attempt to optimize the usage of RcvArray entries is
				359	* made by finding all sets of physically contiguous pages in a
				360	* user's buffer.
				361	* These physically contiguous sets are further split into
				362	* sizes supported by the receive engine of the HFI. The
				363	* resulting sets of pages are stored in struct tid_pageset,
				364	* which describes the sets as:
				365	* * .count - number of pages in this set
				366	* * .idx - starting index into struct page ** array
				367	* of this set
				368	*
				369	* From this point on, the algorithm deals with the page sets
				370	* described above. The number of pagesets is divided by the
				371	* RcvArray group size to produce the number of full groups
				372	* needed.
				373	*
				374	* Groups from the 3 lists are manipulated using the following
				375	* rules:
				376	* 1. For each set of 8 pagesets, a complete group from
				377	* tid_group_list is taken, programmed, and moved to
				378	* the tid_full_list list.
				379	* 2. For all remaining pagesets:
				380	* 2.1 If the tid_used_list is empty and the tid_group_list
				381	* is empty, stop processing pageset and return only
				382	* what has been programmed up to this point.
				383	* 2.2 If the tid_used_list is empty and the tid_group_list
				384	* is not empty, move a group from tid_group_list to
				385	* tid_used_list.
				386	* 2.3 For each group is tid_used_group, program as much as
				387	* can fit into the group. If the group becomes fully
				388	* used, move it to tid_full_list.
				389	*/
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	390	int hfi1_user_exp_rcv_setup(struct file fp, struct hfi1_tid_info tinfo)
				391	{
Mitko Haralanov	7e7a436e	2016-02-05 11:57:57 -0500	[diff] [blame]	392	int ret = 0, need_group = 0, pinned;
				393	struct hfi1_filedata *fd = fp->private_data;
				394	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				395	struct hfi1_devdata *dd = uctxt->dd;
				396	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
				397	tididx = 0, mapped, mapped_pages = 0;
				398	unsigned long vaddr = tinfo->vaddr;
				399	struct page **pages = NULL;
				400	u32 *tidlist = NULL;
				401	struct tid_pageset *pagesets = NULL;
				402
				403	/* Get the number of pages the user buffer spans */
				404	npages = num_user_pages(vaddr, tinfo->length);
				405	if (!npages)
				406	return -EINVAL;
				407
				408	if (npages > uctxt->expected_count) {
				409	dd_dev_err(dd, "Expected buffer too big\n");
				410	return -EINVAL;
				411	}
				412
				413	/* Verify that access is OK for the user buffer */
				414	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
				415	npages * PAGE_SIZE)) {
				416	dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
				417	(void *)vaddr, npages);
				418	return -EFAULT;
				419	}
				420
				421	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
				422	GFP_KERNEL);
				423	if (!pagesets)
				424	return -ENOMEM;
				425
				426	/* Allocate the array of struct page pointers needed for pinning */
				427	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
				428	if (!pages) {
				429	ret = -ENOMEM;
				430	goto bail;
				431	}
				432
				433	/*
				434	* Pin all the pages of the user buffer. If we can't pin all the
				435	* pages, accept the amount pinned so far and program only that.
				436	* User space knows how to deal with partially programmed buffers.
				437	*/
				438	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
				439	if (pinned <= 0) {
				440	ret = pinned;
				441	goto bail;
				442	}
				443
				444	/* Find sets of physically contiguous pages */
				445	npagesets = find_phys_blocks(pages, pinned, pagesets);
				446
				447	/*
				448	* We don't need to access this under a lock since tid_used is per
				449	* process and the same process cannot be in hfi1_user_exp_rcv_clear()
				450	* and hfi1_user_exp_rcv_setup() at the same time.
				451	*/
				452	spin_lock(&fd->tid_lock);
				453	if (fd->tid_used + npagesets > fd->tid_limit)
				454	pageset_count = fd->tid_limit - fd->tid_used;
				455	else
				456	pageset_count = npagesets;
				457	spin_unlock(&fd->tid_lock);
				458
				459	if (!pageset_count)
				460	goto bail;
				461
				462	ngroups = pageset_count / dd->rcv_entries.group_size;
				463	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
				464	if (!tidlist) {
				465	ret = -ENOMEM;
				466	goto nomem;
				467	}
				468
				469	tididx = 0;
				470
				471	/*
				472	* From this point on, we are going to be using shared (between master
				473	* and subcontexts) context resources. We need to take the lock.
				474	*/
				475	mutex_lock(&uctxt->exp_lock);
				476	/*
				477	* The first step is to program the RcvArray entries which are complete
				478	* groups.
				479	*/
				480	while (ngroups && uctxt->tid_group_list.count) {
				481	struct tid_group *grp =
				482	tid_group_pop(&uctxt->tid_group_list);
				483
				484	ret = program_rcvarray(fp, vaddr, grp, pagesets,
				485	pageidx, dd->rcv_entries.group_size,
				486	pages, tidlist, &tididx, &mapped);
				487	/*
				488	* If there was a failure to program the RcvArray
				489	* entries for the entire group, reset the grp fields
				490	* and add the grp back to the free group list.
				491	*/
				492	if (ret <= 0) {
				493	tid_group_add_tail(grp, &uctxt->tid_group_list);
				494	hfi1_cdbg(TID,
				495	"Failed to program RcvArray group %d", ret);
				496	goto unlock;
				497	}
				498
				499	tid_group_add_tail(grp, &uctxt->tid_full_list);
				500	ngroups--;
				501	pageidx += ret;
				502	mapped_pages += mapped;
				503	}
				504
				505	while (pageidx < pageset_count) {
				506	struct tid_group grp, ptr;
				507	/*
				508	* If we don't have any partially used tid groups, check
				509	* if we have empty groups. If so, take one from there and
				510	* put in the partially used list.
				511	*/
				512	if (!uctxt->tid_used_list.count \|\| need_group) {
				513	if (!uctxt->tid_group_list.count)
				514	goto unlock;
				515
				516	grp = tid_group_pop(&uctxt->tid_group_list);
				517	tid_group_add_tail(grp, &uctxt->tid_used_list);
				518	need_group = 0;
				519	}
				520	/*
				521	* There is an optimization opportunity here - instead of
				522	* fitting as many page sets as we can, check for a group
				523	* later on in the list that could fit all of them.
				524	*/
				525	list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
				526	list) {
				527	unsigned use = min_t(unsigned, pageset_count - pageidx,
				528	grp->size - grp->used);
				529
				530	ret = program_rcvarray(fp, vaddr, grp, pagesets,
				531	pageidx, use, pages, tidlist,
				532	&tididx, &mapped);
				533	if (ret < 0) {
				534	hfi1_cdbg(TID,
				535	"Failed to program RcvArray entries %d",
				536	ret);
				537	ret = -EFAULT;
				538	goto unlock;
				539	} else if (ret > 0) {
				540	if (grp->used == grp->size)
				541	tid_group_move(grp,
				542	&uctxt->tid_used_list,
				543	&uctxt->tid_full_list);
				544	pageidx += ret;
				545	mapped_pages += mapped;
				546	need_group = 0;
				547	/* Check if we are done so we break out early */
				548	if (pageidx >= pageset_count)
				549	break;
				550	} else if (WARN_ON(ret == 0)) {
				551	/*
				552	* If ret is 0, we did not program any entries
				553	* into this group, which can only happen if
				554	* we've screwed up the accounting somewhere.
				555	* Warn and try to continue.
				556	*/
				557	need_group = 1;
				558	}
				559	}
				560	}
				561	unlock:
				562	mutex_unlock(&uctxt->exp_lock);
				563	nomem:
				564	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
				565	mapped_pages, ret);
				566	if (tididx) {
				567	spin_lock(&fd->tid_lock);
				568	fd->tid_used += tididx;
				569	spin_unlock(&fd->tid_lock);
				570	tinfo->tidcnt = tididx;
				571	tinfo->length = mapped_pages * PAGE_SIZE;
				572
				573	if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
				574	tidlist, sizeof(tidlist[0]) * tididx)) {
				575	/*
				576	* On failure to copy to the user level, we need to undo
				577	* everything done so far so we don't leak resources.
				578	*/
				579	tinfo->tidlist = (unsigned long)&tidlist;
				580	hfi1_user_exp_rcv_clear(fp, tinfo);
				581	tinfo->tidlist = 0;
				582	ret = -EFAULT;
				583	goto bail;
				584	}
				585	}
				586
				587	/*
				588	* If not everything was mapped (due to insufficient RcvArray entries,
				589	* for example), unpin all unmapped pages so we can pin them nex time.
				590	*/
				591	if (mapped_pages != pinned)
				592	hfi1_release_user_pages(&pages[mapped_pages],
				593	pinned - mapped_pages,
				594	false);
				595	bail:
				596	kfree(pagesets);
				597	kfree(pages);
				598	kfree(tidlist);
				599	return ret > 0 ? 0 : ret;
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	600	}
				601
				602	int hfi1_user_exp_rcv_clear(struct file fp, struct hfi1_tid_info tinfo)
				603	{
Mitko Haralanov	455d7f1	2016-02-05 11:57:56 -0500	[diff] [blame]	604	int ret = 0;
				605	struct hfi1_filedata *fd = fp->private_data;
				606	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				607	u32 *tidinfo;
				608	unsigned tididx;
				609
				610	tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
				611	if (!tidinfo)
				612	return -ENOMEM;
				613
				614	if (copy_from_user(tidinfo, (void __user *)(unsigned long)
				615	tinfo->tidlist, sizeof(tidinfo[0]) *
				616	tinfo->tidcnt)) {
				617	ret = -EFAULT;
				618	goto done;
				619	}
				620
				621	mutex_lock(&uctxt->exp_lock);
				622	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
				623	ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
				624	if (ret) {
				625	hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				626	ret);
				627	break;
				628	}
				629	}
				630	spin_lock(&fd->tid_lock);
				631	fd->tid_used -= tididx;
				632	spin_unlock(&fd->tid_lock);
				633	tinfo->tidcnt = tididx;
				634	mutex_unlock(&uctxt->exp_lock);
				635	done:
				636	kfree(tidinfo);
				637	return ret;
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	638	}
				639
				640	int hfi1_user_exp_rcv_invalid(struct file fp, struct hfi1_tid_info tinfo)
				641	{
Mitko Haralanov	455d7f1	2016-02-05 11:57:56 -0500	[diff] [blame]	642	struct hfi1_filedata *fd = fp->private_data;
				643	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				644	unsigned long *ev = uctxt->dd->events +
				645	(((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
				646	HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
				647	u32 *array;
				648	int ret = 0;
				649
				650	if (!fd->invalid_tids)
				651	return -EINVAL;
				652
				653	/*
				654	* copy_to_user() can sleep, which will leave the invalid_lock
				655	* locked and cause the MMU notifier to be blocked on the lock
				656	* for a long time.
				657	* Copy the data to a local buffer so we can release the lock.
				658	*/
				659	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
				660	if (!array)
				661	return -EFAULT;
				662
				663	spin_lock(&fd->invalid_lock);
				664	if (fd->invalid_tid_idx) {
				665	memcpy(array, fd->invalid_tids, sizeof(array)
				666	fd->invalid_tid_idx);
				667	memset(fd->invalid_tids, 0, sizeof(fd->invalid_tids)
				668	fd->invalid_tid_idx);
				669	tinfo->tidcnt = fd->invalid_tid_idx;
				670	fd->invalid_tid_idx = 0;
				671	/*
				672	* Reset the user flag while still holding the lock.
				673	* Otherwise, PSM can miss events.
				674	*/
				675	clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
				676	} else {
				677	tinfo->tidcnt = 0;
				678	}
				679	spin_unlock(&fd->invalid_lock);
				680
				681	if (tinfo->tidcnt) {
				682	if (copy_to_user((void __user *)tinfo->tidlist,
				683	array, sizeof(array) tinfo->tidcnt))
				684	ret = -EFAULT;
				685	}
				686	kfree(array);
				687
				688	return ret;
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	689	}
				690
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	691	static u32 find_phys_blocks(struct page **pages, unsigned npages,
				692	struct tid_pageset *list)
				693	{
				694	unsigned pagecount, pageidx, setcount = 0, i;
				695	unsigned long pfn, this_pfn;
				696
				697	if (!npages)
				698	return 0;
				699
				700	/*
				701	* Look for sets of physically contiguous pages in the user buffer.
				702	* This will allow us to optimize Expected RcvArray entry usage by
				703	* using the bigger supported sizes.
				704	*/
				705	pfn = page_to_pfn(pages[0]);
				706	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
				707	this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
				708
				709	/*
				710	* If the pfn's are not sequential, pages are not physically
				711	* contiguous.
				712	*/
				713	if (this_pfn != ++pfn) {
				714	/*
				715	* At this point we have to loop over the set of
				716	* physically contiguous pages and break them down it
				717	* sizes supported by the HW.
				718	* There are two main constraints:
				719	* 1. The max buffer size is MAX_EXPECTED_BUFFER.
				720	* If the total set size is bigger than that
				721	* program only a MAX_EXPECTED_BUFFER chunk.
				722	* 2. The buffer size has to be a power of two. If
				723	* it is not, round down to the closes power of
				724	* 2 and program that size.
				725	*/
				726	while (pagecount) {
				727	int maxpages = pagecount;
				728	u32 bufsize = pagecount * PAGE_SIZE;
				729
				730	if (bufsize > MAX_EXPECTED_BUFFER)
				731	maxpages =
				732	MAX_EXPECTED_BUFFER >>
				733	PAGE_SHIFT;
				734	else if (!is_power_of_2(bufsize))
				735	maxpages =
				736	rounddown_pow_of_two(bufsize) >>
				737	PAGE_SHIFT;
				738
				739	list[setcount].idx = pageidx;
				740	list[setcount].count = maxpages;
				741	pagecount -= maxpages;
				742	pageidx += maxpages;
				743	setcount++;
				744	}
				745	pageidx = i;
				746	pagecount = 1;
				747	pfn = this_pfn;
				748	} else {
				749	pagecount++;
				750	}
				751	}
				752	return setcount;
				753	}
				754
				755	/**
				756	* program_rcvarray() - program an RcvArray group with receive buffers
				757	* @fp: file pointer
				758	* @vaddr: starting user virtual address
				759	* @grp: RcvArray group
				760	* @sets: array of struct tid_pageset holding information on physically
				761	* contiguous chunks from the user buffer
				762	* @start: starting index into sets array
				763	* @count: number of struct tid_pageset's to program
				764	* @pages: an array of struct page * for the user buffer
				765	* @tidlist: the array of u32 elements when the information about the
				766	* programmed RcvArray entries is to be encoded.
				767	* @tididx: starting offset into tidlist
				768	* @pmapped: (output parameter) number of pages programmed into the RcvArray
				769	* entries.
				770	*
				771	* This function will program up to 'count' number of RcvArray entries from the
				772	* group 'grp'. To make best use of write-combining writes, the function will
				773	* perform writes to the unused RcvArray entries which will be ignored by the
				774	* HW. Each RcvArray entry will be programmed with a physically contiguous
				775	* buffer chunk from the user's virtual buffer.
				776	*
				777	* Return:
				778	* -EINVAL if the requested count is larger than the size of the group,
				779	* -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
				780	* number of RcvArray entries programmed.
				781	*/
				782	static int program_rcvarray(struct file *fp, unsigned long vaddr,
				783	struct tid_group *grp,
				784	struct tid_pageset *sets,
				785	unsigned start, u16 count, struct page **pages,
				786	u32 tidlist, unsigned tididx, unsigned *pmapped)
				787	{
				788	struct hfi1_filedata *fd = fp->private_data;
				789	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				790	struct hfi1_devdata *dd = uctxt->dd;
				791	u16 idx;
				792	u32 tidinfo = 0, rcventry, useidx = 0;
				793	int mapped = 0;
				794
				795	/* Count should never be larger than the group size */
				796	if (count > grp->size)
				797	return -EINVAL;
				798
				799	/* Find the first unused entry in the group */
				800	for (idx = 0; idx < grp->size; idx++) {
				801	if (!(grp->map & (1 << idx))) {
				802	useidx = idx;
				803	break;
				804	}
				805	rcv_array_wc_fill(dd, grp->base + idx);
				806	}
				807
				808	idx = 0;
				809	while (idx < count) {
				810	u16 npages, pageidx, setidx = start + idx;
				811	int ret = 0;
				812
				813	/*
				814	* If this entry in the group is used, move to the next one.
				815	* If we go past the end of the group, exit the loop.
				816	*/
				817	if (useidx >= grp->size) {
				818	break;
				819	} else if (grp->map & (1 << useidx)) {
				820	rcv_array_wc_fill(dd, grp->base + useidx);
				821	useidx++;
				822	continue;
				823	}
				824
				825	rcventry = grp->base + useidx;
				826	npages = sets[setidx].count;
				827	pageidx = sets[setidx].idx;
				828
				829	ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
				830	rcventry, grp, pages + pageidx,
				831	npages);
				832	if (ret)
				833	return ret;
				834	mapped += npages;
				835
				836	tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) \|
				837	EXP_TID_SET(LEN, npages);
				838	tidlist[(*tididx)++] = tidinfo;
				839	grp->used++;
				840	grp->map \|= 1 << useidx++;
				841	idx++;
				842	}
				843
				844	/* Fill the rest of the group with "blank" writes */
				845	for (; useidx < grp->size; useidx++)
				846	rcv_array_wc_fill(dd, grp->base + useidx);
				847	*pmapped = mapped;
				848	return idx;
				849	}
				850
				851	static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
				852	u32 rcventry, struct tid_group *grp,
				853	struct page **pages, unsigned npages)
				854	{
				855	int ret;
				856	struct hfi1_filedata *fd = fp->private_data;
				857	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				858	struct mmu_rb_node *node;
				859	struct hfi1_devdata *dd = uctxt->dd;
				860	struct rb_root *root = &fd->tid_rb_root;
				861	dma_addr_t phys;
				862
				863	/*
				864	* Allocate the node first so we can handle a potential
				865	* failure before we've programmed anything.
				866	*/
				867	node = kzalloc(sizeof(node) + (sizeof(struct page ) * npages),
				868	GFP_KERNEL);
				869	if (!node)
				870	return -ENOMEM;
				871
				872	phys = pci_map_single(dd->pcidev,
				873	__va(page_to_phys(pages[0])),
				874	npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
				875	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
				876	dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
				877	phys);
				878	kfree(node);
				879	return -EFAULT;
				880	}
				881
				882	node->virt = vaddr;
				883	node->phys = page_to_phys(pages[0]);
				884	node->len = npages * PAGE_SIZE;
				885	node->npages = npages;
				886	node->rcventry = rcventry;
				887	node->dma_addr = phys;
				888	node->grp = grp;
				889	node->freed = false;
				890	memcpy(node->pages, pages, sizeof(struct page ) npages);
				891
				892	spin_lock(&fd->rb_lock);
				893	ret = fd->mmu_rb_insert(root, node);
				894	spin_unlock(&fd->rb_lock);
				895
				896	if (ret) {
				897	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
				898	node->rcventry, node->virt, node->phys, ret);
				899	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
				900	PCI_DMA_FROMDEVICE);
				901	kfree(node);
				902	return -EFAULT;
				903	}
				904	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
Mitko Haralanov	0b091fb	2016-02-05 11:57:58 -0500	[diff] [blame]	905	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry,
				906	npages, node->virt, node->phys, phys);
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	907	return 0;
				908	}
				909
				910	static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
				911	struct tid_group **grp)
				912	{
				913	struct hfi1_filedata *fd = fp->private_data;
				914	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				915	struct hfi1_devdata *dd = uctxt->dd;
				916	struct mmu_rb_node *node;
				917	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
				918	u32 tidbase = uctxt->expected_base,
				919	tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
				920
				921	if (tididx >= uctxt->expected_count) {
				922	dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
				923	tididx, uctxt->ctxt);
				924	return -EINVAL;
				925	}
				926
				927	if (tidctrl == 0x3)
				928	return -EINVAL;
				929
				930	rcventry = tidbase + tididx + (tidctrl - 1);
				931
				932	spin_lock(&fd->rb_lock);
				933	node = mmu_rb_search_by_entry(&fd->tid_rb_root, rcventry);
				934	if (!node) {
				935	spin_unlock(&fd->rb_lock);
				936	return -EBADF;
				937	}
				938	rb_erase(&node->rbnode, &fd->tid_rb_root);
				939	spin_unlock(&fd->rb_lock);
				940	if (grp)
				941	*grp = node->grp;
				942	clear_tid_node(fd, fd->subctxt, node);
				943	return 0;
				944	}
				945
				946	static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
				947	struct mmu_rb_node *node)
				948	{
				949	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				950	struct hfi1_devdata *dd = uctxt->dd;
				951
Mitko Haralanov	0b091fb	2016-02-05 11:57:58 -0500	[diff] [blame]	952	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				953	node->npages, node->virt, node->phys,
				954	node->dma_addr);
				955
Mitko Haralanov	f88e0c8	2016-02-05 11:57:52 -0500	[diff] [blame]	956	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
				957	/*
				958	* Make sure device has seen the write before we unpin the
				959	* pages.
				960	*/
				961	flush_wc();
				962
				963	pci_unmap_single(dd->pcidev, node->dma_addr, node->len,
				964	PCI_DMA_FROMDEVICE);
				965	hfi1_release_user_pages(node->pages, node->npages, true);
				966
				967	node->grp->used--;
				968	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
				969
				970	if (node->grp->used == node->grp->size - 1)
				971	tid_group_move(node->grp, &uctxt->tid_full_list,
				972	&uctxt->tid_used_list);
				973	else if (!node->grp->used)
				974	tid_group_move(node->grp, &uctxt->tid_used_list,
				975	&uctxt->tid_group_list);
				976	kfree(node);
				977	}
				978
				979	static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
				980	struct exp_tid_set set, struct rb_root root)
				981	{
				982	struct tid_group grp, ptr;
				983	struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
				984	tid_rb_root);
				985	int i;
				986
				987	list_for_each_entry_safe(grp, ptr, &set->list, list) {
				988	list_del_init(&grp->list);
				989
				990	spin_lock(&fd->rb_lock);
				991	for (i = 0; i < grp->size; i++) {
				992	if (grp->map & (1 << i)) {
				993	u16 rcventry = grp->base + i;
				994	struct mmu_rb_node *node;
				995
				996	node = mmu_rb_search_by_entry(root, rcventry);
				997	if (!node)
				998	continue;
				999	rb_erase(&node->rbnode, root);
				1000	clear_tid_node(fd, -1, node);
				1001	}
				1002	}
				1003	spin_unlock(&fd->rb_lock);
				1004	}
				1005	}
				1006
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	1007	static inline void mmu_notifier_page(struct mmu_notifier *mn,
				1008	struct mm_struct *mm, unsigned long addr)
				1009	{
				1010	mmu_notifier_mem_invalidate(mn, addr, addr + PAGE_SIZE,
				1011	MMU_INVALIDATE_PAGE);
				1012	}
				1013
				1014	static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
				1015	struct mm_struct *mm,
				1016	unsigned long start,
				1017	unsigned long end)
				1018	{
				1019	mmu_notifier_mem_invalidate(mn, start, end, MMU_INVALIDATE_RANGE);
				1020	}
				1021
				1022	static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
				1023	unsigned long start, unsigned long end,
				1024	enum mmu_call_types type)
				1025	{
Mitko Haralanov	b5eb3b2	2016-02-05 11:57:55 -0500	[diff] [blame]	1026	struct hfi1_filedata *fd = container_of(mn, struct hfi1_filedata, mn);
				1027	struct hfi1_ctxtdata *uctxt = fd->uctxt;
				1028	struct rb_root *root = &fd->tid_rb_root;
				1029	struct mmu_rb_node *node;
				1030	unsigned long addr = start;
				1031
Mitko Haralanov	0b091fb	2016-02-05 11:57:58 -0500	[diff] [blame]	1032	trace_hfi1_mmu_invalidate(uctxt->ctxt, fd->subctxt, mmu_types[type],
				1033	start, end);
				1034
Mitko Haralanov	b5eb3b2	2016-02-05 11:57:55 -0500	[diff] [blame]	1035	spin_lock(&fd->rb_lock);
				1036	while (addr < end) {
				1037	node = mmu_rb_search_by_addr(root, addr);
				1038
				1039	if (!node) {
				1040	/*
				1041	* Didn't find a node at this address. However, the
				1042	* range could be bigger than what we have registered
				1043	* so we have to keep looking.
				1044	*/
				1045	addr += PAGE_SIZE;
				1046	continue;
				1047	}
				1048
				1049	/*
				1050	* The next address to be looked up is computed based
				1051	* on the node's starting address. This is due to the
				1052	* fact that the range where we start might be in the
				1053	* middle of the node's buffer so simply incrementing
				1054	* the address by the node's size would result is a
				1055	* bad address.
				1056	*/
				1057	addr = node->virt + (node->npages * PAGE_SIZE);
				1058	if (node->freed)
				1059	continue;
				1060
Mitko Haralanov	0b091fb	2016-02-05 11:57:58 -0500	[diff] [blame]	1061	trace_hfi1_exp_tid_inval(uctxt->ctxt, fd->subctxt, node->virt,
				1062	node->rcventry, node->npages,
				1063	node->dma_addr);
Mitko Haralanov	b5eb3b2	2016-02-05 11:57:55 -0500	[diff] [blame]	1064	node->freed = true;
				1065
				1066	spin_lock(&fd->invalid_lock);
				1067	if (fd->invalid_tid_idx < uctxt->expected_count) {
				1068	fd->invalid_tids[fd->invalid_tid_idx] =
				1069	rcventry2tidinfo(node->rcventry -
				1070	uctxt->expected_base);
				1071	fd->invalid_tids[fd->invalid_tid_idx] \|=
				1072	EXP_TID_SET(LEN, node->npages);
				1073	if (!fd->invalid_tid_idx) {
				1074	unsigned long *ev;
				1075
				1076	/*
				1077	* hfi1_set_uevent_bits() sets a user event flag
				1078	* for all processes. Because calling into the
				1079	* driver to process TID cache invalidations is
				1080	* expensive and TID cache invalidations are
				1081	* handled on a per-process basis, we can
				1082	* optimize this to set the flag only for the
				1083	* process in question.
				1084	*/
				1085	ev = uctxt->dd->events +
				1086	(((uctxt->ctxt -
				1087	uctxt->dd->first_user_ctxt) *
				1088	HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
				1089	set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
				1090	}
				1091	fd->invalid_tid_idx++;
				1092	}
				1093	spin_unlock(&fd->invalid_lock);
				1094	}
				1095	spin_unlock(&fd->rb_lock);
Mitko Haralanov	f727a0c	2016-02-05 11:57:46 -0500	[diff] [blame]	1096	}
				1097
				1098	static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr,
				1099	unsigned long len)
				1100	{
				1101	if ((addr + len) <= node->virt)
				1102	return -1;
				1103	else if (addr >= node->virt && addr < (node->virt + node->len))
				1104	return 0;
				1105	else
				1106	return 1;
				1107	}
				1108
				1109	static inline int mmu_entry_cmp(struct mmu_rb_node *node, u32 entry)
				1110	{
				1111	if (entry < node->rcventry)
				1112	return -1;
				1113	else if (entry > node->rcventry)
				1114	return 1;
				1115	else
				1116	return 0;
				1117	}
				1118
				1119	static struct mmu_rb_node mmu_rb_search_by_addr(struct rb_root root,
				1120	unsigned long addr)
				1121	{
				1122	struct rb_node *node = root->rb_node;
				1123
				1124	while (node) {
				1125	struct mmu_rb_node *mnode =
				1126	container_of(node, struct mmu_rb_node, rbnode);
				1127	/*
				1128	* When searching, use at least one page length for size. The
				1129	* MMU notifier will not give us anything less than that. We
				1130	* also don't need anything more than a page because we are
				1131	* guaranteed to have non-overlapping buffers in the tree.
				1132	*/
				1133	int result = mmu_addr_cmp(mnode, addr, PAGE_SIZE);
				1134
				1135	if (result < 0)
				1136	node = node->rb_left;
				1137	else if (result > 0)
				1138	node = node->rb_right;
				1139	else
				1140	return mnode;
				1141	}
				1142	return NULL;
				1143	}
				1144
				1145	static inline struct mmu_rb_node mmu_rb_search_by_entry(struct rb_root root,
				1146	u32 index)
				1147	{
				1148	struct mmu_rb_node *rbnode;
				1149	struct rb_node *node;
				1150
				1151	if (root && !RB_EMPTY_ROOT(root))
				1152	for (node = rb_first(root); node; node = rb_next(node)) {
				1153	rbnode = rb_entry(node, struct mmu_rb_node, rbnode);
				1154	if (rbnode->rcventry == index)
				1155	return rbnode;
				1156	}
				1157	return NULL;
				1158	}
				1159
				1160	static int mmu_rb_insert_by_entry(struct rb_root *root,
				1161	struct mmu_rb_node *node)
				1162	{
				1163	struct rb_node *new = &root->rb_node, parent = NULL;
				1164
				1165	while (*new) {
				1166	struct mmu_rb_node *this =
				1167	container_of(*new, struct mmu_rb_node, rbnode);
				1168	int result = mmu_entry_cmp(this, node->rcventry);
				1169
				1170	parent = *new;
				1171	if (result < 0)
				1172	new = &((*new)->rb_left);
				1173	else if (result > 0)
				1174	new = &((*new)->rb_right);
				1175	else
				1176	return 1;
				1177	}
				1178
				1179	rb_link_node(&node->rbnode, parent, new);
				1180	rb_insert_color(&node->rbnode, root);
				1181	return 0;
				1182	}
				1183
				1184	static int mmu_rb_insert_by_addr(struct rb_root root, struct mmu_rb_node node)
				1185	{
				1186	struct rb_node *new = &root->rb_node, parent = NULL;
				1187
				1188	/* Figure out where to put new node */
				1189	while (*new) {
				1190	struct mmu_rb_node *this =
				1191	container_of(*new, struct mmu_rb_node, rbnode);
				1192	int result = mmu_addr_cmp(this, node->virt, node->len);
				1193
				1194	parent = *new;
				1195	if (result < 0)
				1196	new = &((*new)->rb_left);
				1197	else if (result > 0)
				1198	new = &((*new)->rb_right);
				1199	else
				1200	return 1;
				1201	}
				1202
				1203	/* Add new node and rebalance tree. */
				1204	rb_link_node(&node->rbnode, parent, new);
				1205	rb_insert_color(&node->rbnode, root);
				1206
				1207	return 0;
				1208	}