/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

#include "mlx5_ib.h"

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

struct workqueue_struct *mlx5_ib_page_fault_wq;

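/*
 * Invalidate an ODP umem range: zap the hardware MTTs covering [start, end)
 * and unmap the corresponding DMA pages, so the device can no longer access
 * memory that is being invalidated (typically via the MMU-notifier path).
 */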
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
                              unsigned long end)
{
        struct mlx5_ib_mr *mr;
        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
                                    sizeof(struct mlx5_mtt)) - 1;
        u64 idx = 0, blk_start_idx = 0;
        int in_block = 0;
        u64 addr;

        if (!umem || !umem->odp_data) {
                pr_err("invalidation called on NULL umem or non-ODP umem\n");
                return;
        }

        mr = umem->odp_data->private;

        if (!mr || !mr->ibmr.pd)
                return;

        start = max_t(u64, ib_umem_start(umem), start);
        end = min_t(u64, ib_umem_end(umem), end);

        /*
         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
         * while we are doing the invalidation, no page fault will attempt to
         * overwrite the same MTTs. Concurrent invalidations might race us,
         * but they will write 0s as well, so no difference in the end result.
         */

        for (addr = start; addr < end; addr += (u64)umem->page_size) {
                idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
                /*
                 * Strive to write the MTTs in chunks, but avoid overwriting
                 * non-existing MTTs. The heuristic here can be improved to
                 * estimate the cost of another UMR vs. the cost of a bigger
                 * UMR.
                 */
                if (umem->odp_data->dma_list[idx] &
                    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
                        if (!in_block) {
                                blk_start_idx = idx;
                                in_block = 1;
                        }
                } else {
                        u64 umr_offset = idx & umr_block_mask;

                        if (in_block && umr_offset == 0) {
                                mlx5_ib_update_mtt(mr, blk_start_idx,
                                                   idx - blk_start_idx, 1);
                                in_block = 0;
                        }
                }
        }
        if (in_block)
                mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
                                   1);

        /*
         * We are now sure that the device will not access the
         * memory. We can safely unmap it, and mark it as dirty if
         * needed.
         */

        ib_umem_odp_unmap_dma_pages(umem, start, end);
}

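/*
 * Fill in dev->odp_caps according to what the device reports: general paging
 * support, the maximum ODP MR size, and which UD/RC operations may fault
 * pages on demand.
 */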
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
        struct ib_odp_caps *caps = &dev->odp_caps;

        memset(caps, 0, sizeof(*caps));

        if (!MLX5_CAP_GEN(dev->mdev, pg))
                return;

        caps->general_caps = IB_ODP_SUPPORT;

        if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
                dev->odp_max_size = U64_MAX;
        else
                dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

        if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
                caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

        if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
                caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

        return;
}

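/*
 * Look up the MR that owns the given lkey/rkey. Returns NULL if the mkey is
 * unknown, does not match the full key, or belongs to an MR that is no
 * longer live.
 */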
static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
                                                   u32 key)
{
        u32 base_key = mlx5_base_mkey(key);
        struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key);
        struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

        if (!mmkey || mmkey->key != key || !mr->live)
                return NULL;

        return container_of(mmkey, struct mlx5_ib_mr, mmkey);
}

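/*
 * Report completion of page-fault handling to the device, so that it resumes
 * the stalled QP, or moves it to error when @error is set.
 */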
static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
                                      struct mlx5_ib_pfault *pfault,
                                      int error)
{
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
        u32 qpn = qp->trans_qp.base.mqp.qpn;
        int ret = mlx5_core_page_fault_resume(dev->mdev,
                                              qpn,
                                              pfault->mpfault.flags,
                                              error);
        if (ret)
                pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn);
}

/*
 * Handle a single data segment in a page-fault WQE.
 *
 * Returns number of pages retrieved on success. The caller will continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling and possibly move the QP to an error state.
 * On other errors the QP should also be closed with an error.
 */
static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
                                         struct mlx5_ib_pfault *pfault,
                                         u32 key, u64 io_virt, size_t bcnt,
                                         u32 *bytes_mapped)
{
        struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
        int srcu_key;
        unsigned int current_seq;
        u64 start_idx;
        int npages = 0, ret = 0;
        struct mlx5_ib_mr *mr;
        u64 access_mask = ODP_READ_ALLOWED_BIT;

        srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
        mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
        /*
         * If we didn't find the MR, it means the MR was closed while we were
         * handling the ODP event. In this case we return -EFAULT so that the
         * QP will be closed.
         */
        if (!mr || !mr->ibmr.pd) {
                pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
                       key);
                ret = -EFAULT;
                goto srcu_unlock;
        }
        if (!mr->umem->odp_data) {
                pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
                         key);
                if (bytes_mapped)
                        *bytes_mapped +=
                                (bcnt - pfault->mpfault.bytes_committed);
                goto srcu_unlock;
        }
        if (mr->ibmr.pd != qp->ibqp.pd) {
                pr_err("Page-fault with different PDs for QP and MR.\n");
                ret = -EFAULT;
                goto srcu_unlock;
        }

        current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
        /*
         * Ensure the sequence number is valid for some time before we call
         * gup.
         */
        smp_rmb();

        /*
         * Avoid branches - this code will perform correctly
         * in all iterations (in iteration 2 and above,
         * bytes_committed == 0).
         */
        io_virt += pfault->mpfault.bytes_committed;
        bcnt -= pfault->mpfault.bytes_committed;

        start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;

        if (mr->umem->writable)
                access_mask |= ODP_WRITE_ALLOWED_BIT;
        npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
                                           access_mask, current_seq);
        if (npages < 0) {
                ret = npages;
                goto srcu_unlock;
        }

        if (npages > 0) {
                mutex_lock(&mr->umem->odp_data->umem_mutex);
                if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
                        /*
                         * No need to check whether the MTTs really belong to
                         * this MR, since ib_umem_odp_map_dma_pages already
                         * checks this.
                         */
                        ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
                } else {
                        ret = -EAGAIN;
                }
                mutex_unlock(&mr->umem->odp_data->umem_mutex);
                if (ret < 0) {
                        if (ret != -EAGAIN)
                                pr_err("Failed to update mkey page tables\n");
                        goto srcu_unlock;
                }

                if (bytes_mapped) {
                        u32 new_mappings = npages * PAGE_SIZE -
                                (io_virt - round_down(io_virt, PAGE_SIZE));
                        *bytes_mapped += min_t(u32, new_mappings, bcnt);
                }
        }

srcu_unlock:
        if (ret == -EAGAIN) {
                if (!mr->umem->odp_data->dying) {
                        struct ib_umem_odp *odp_data = mr->umem->odp_data;
                        unsigned long timeout =
                                msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);

                        if (!wait_for_completion_timeout(
                                        &odp_data->notifier_completion,
                                        timeout)) {
                                pr_warn("timeout waiting for mmu notifier completion\n");
                        }
                } else {
                        /* The MR is being killed, kill the QP as well. */
                        ret = -EFAULT;
                }
        }
        srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
        pfault->mpfault.bytes_committed = 0;
        return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @qp: the QP on which the fault occurred.
 * @pfault: contains page fault information.
 * @wqe: points at the first data segment in the WQE.
 * @wqe_end: points after the end of the WQE.
 * @bytes_mapped: receives the number of bytes that the function was able to
 *                map. This allows the caller to decide intelligently whether
 *                enough memory was mapped to resolve the page fault
 *                successfully (e.g. enough for the next MTU, or the entire
 *                WQE).
 * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
 *                   the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_qp *qp,
                                   struct mlx5_ib_pfault *pfault, void *wqe,
                                   void *wqe_end, u32 *bytes_mapped,
                                   u32 *total_wqe_bytes, int receive_queue)
{
        int ret = 0, npages = 0;
        u64 io_virt;
        u32 key;
        u32 byte_count;
        size_t bcnt;
        int inline_segment;

        /* Skip SRQ next-WQE segment. */
        if (receive_queue && qp->ibqp.srq)
                wqe += sizeof(struct mlx5_wqe_srq_next_seg);

        if (bytes_mapped)
                *bytes_mapped = 0;
        if (total_wqe_bytes)
                *total_wqe_bytes = 0;

        while (wqe < wqe_end) {
                struct mlx5_wqe_data_seg *dseg = wqe;

                io_virt = be64_to_cpu(dseg->addr);
                key = be32_to_cpu(dseg->lkey);
                byte_count = be32_to_cpu(dseg->byte_count);
                inline_segment = !!(byte_count & MLX5_INLINE_SEG);
                bcnt = byte_count & ~MLX5_INLINE_SEG;

                if (inline_segment) {
                        bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
                        wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
                                     16);
                } else {
                        wqe += sizeof(*dseg);
                }

                /* receive WQE end of sg list. */
                if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
                    io_virt == 0)
                        break;

                if (!inline_segment && total_wqe_bytes) {
                        *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
                                        pfault->mpfault.bytes_committed);
                }

                /* A zero length data segment designates a length of 2GB. */
                if (bcnt == 0)
                        bcnt = 1U << 31;

                if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
                        pfault->mpfault.bytes_committed -=
                                min_t(size_t, bcnt,
                                      pfault->mpfault.bytes_committed);
                        continue;
                }

                ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
                                                    bcnt, bytes_mapped);
                if (ret < 0)
                        break;
                npages += ret;
        }

        return ret < 0 ? ret : npages;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
        struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
        void **wqe, void **wqe_end, int wqe_length)
{
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
        struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
        u16 wqe_index = pfault->mpfault.wqe.wqe_index;
        unsigned ds, opcode;
#if defined(DEBUG)
        u32 ctrl_wqe_index, ctrl_qpn;
#endif
        u32 qpn = qp->trans_qp.base.mqp.qpn;

        ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
        if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
                mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
                            ds, wqe_length);
                return -EFAULT;
        }

        if (ds == 0) {
                mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
                            wqe_index, qpn);
                return -EFAULT;
        }

#if defined(DEBUG)
        ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
                          MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
                         MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
        if (wqe_index != ctrl_wqe_index) {
                mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
                            wqe_index, qpn,
                            ctrl_wqe_index);
                return -EFAULT;
        }

        ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
                   MLX5_WQE_CTRL_QPN_SHIFT;
        if (qpn != ctrl_qpn) {
                mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
                            wqe_index, qpn,
                            ctrl_qpn);
                return -EFAULT;
        }
#endif /* DEBUG */

        *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
        *wqe += sizeof(*ctrl);

        opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
                 MLX5_WQE_CTRL_OPCODE_MASK;
        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
                switch (opcode) {
                case MLX5_OPCODE_SEND:
                case MLX5_OPCODE_SEND_IMM:
                case MLX5_OPCODE_SEND_INVAL:
                        if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
                              IB_ODP_SUPPORT_SEND))
                                goto invalid_transport_or_opcode;
                        break;
                case MLX5_OPCODE_RDMA_WRITE:
                case MLX5_OPCODE_RDMA_WRITE_IMM:
                        if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
                              IB_ODP_SUPPORT_WRITE))
                                goto invalid_transport_or_opcode;
                        *wqe += sizeof(struct mlx5_wqe_raddr_seg);
                        break;
                case MLX5_OPCODE_RDMA_READ:
                        if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
                              IB_ODP_SUPPORT_READ))
                                goto invalid_transport_or_opcode;
                        *wqe += sizeof(struct mlx5_wqe_raddr_seg);
                        break;
                default:
                        goto invalid_transport_or_opcode;
                }
                break;
        case IB_QPT_UD:
                switch (opcode) {
                case MLX5_OPCODE_SEND:
                case MLX5_OPCODE_SEND_IMM:
                        if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
                              IB_ODP_SUPPORT_SEND))
                                goto invalid_transport_or_opcode;
                        *wqe += sizeof(struct mlx5_wqe_datagram_seg);
                        break;
                default:
                        goto invalid_transport_or_opcode;
                }
                break;
        default:
invalid_transport_or_opcode:
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
                            qp->ibqp.qp_type, opcode);
                return -EFAULT;
        }

        return 0;
}

/*
 * Parse responder WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and sets wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler(
        struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
        void **wqe, void **wqe_end, int wqe_length)
{
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
        struct mlx5_ib_wq *wq = &qp->rq;
        int wqe_size = 1 << wq->wqe_shift;

        if (qp->ibqp.srq) {
                mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
                return -EFAULT;
        }

        if (qp->wq_sig) {
                mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
                return -EFAULT;
        }

        if (wqe_size > wqe_length) {
                mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
                return -EFAULT;
        }

        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
                if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
                      IB_ODP_SUPPORT_RECV))
                        goto invalid_transport_or_opcode;
                break;
        default:
invalid_transport_or_opcode:
                mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
                            qp->ibqp.qp_type);
                return -EFAULT;
        }

        *wqe_end = *wqe + wqe_size;

        return 0;
}

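/*
 * Handle a WQE-triggered page fault: read the faulting WQE from user space,
 * parse its data segments (initiator or responder side), fault in the missing
 * pages, and finally resume the QP, with an error if any step failed.
 */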
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
                                          struct mlx5_ib_pfault *pfault)
{
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
        int ret;
        void *wqe, *wqe_end;
        u32 bytes_mapped, total_wqe_bytes;
        char *buffer = NULL;
        int resume_with_error = 0;
        u16 wqe_index = pfault->mpfault.wqe.wqe_index;
        int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
        u32 qpn = qp->trans_qp.base.mqp.qpn;

        buffer = (char *)__get_free_page(GFP_KERNEL);
        if (!buffer) {
                mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
                resume_with_error = 1;
                goto resolve_page_fault;
        }

        ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
                                    PAGE_SIZE, &qp->trans_qp.base);
        if (ret < 0) {
                mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
                            -ret, wqe_index, qpn);
                resume_with_error = 1;
                goto resolve_page_fault;
        }

        wqe = buffer;
        if (requestor)
                ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
                                                          &wqe_end, ret);
        else
                ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
                                                          &wqe_end, ret);
        if (ret < 0) {
                resume_with_error = 1;
                goto resolve_page_fault;
        }

        if (wqe >= wqe_end) {
                mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
                resume_with_error = 1;
                goto resolve_page_fault;
        }

        ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
                                      &total_wqe_bytes, !requestor);
        if (ret == -EAGAIN) {
                goto resolve_page_fault;
        } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
                mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
                            -ret);
                resume_with_error = 1;
                goto resolve_page_fault;
        }

resolve_page_fault:
        mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
                    qpn, resume_with_error,
                    pfault->mpfault.flags);

        free_page((unsigned long)buffer);
}

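/* Number of pages spanned by a buffer of @length bytes starting at @address. */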
static int pages_in_range(u64 address, u32 length)
{
        return (ALIGN(address + length, PAGE_SIZE) -
                (address & PAGE_MASK)) >> PAGE_SHIFT;
}

static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
                                           struct mlx5_ib_pfault *pfault)
{
        struct mlx5_pagefault *mpfault = &pfault->mpfault;
        u64 address;
        u32 length;
        u32 prefetch_len = mpfault->bytes_committed;
        int prefetch_activated = 0;
        u32 rkey = mpfault->rdma.r_key;
        int ret;

        /* The RDMA responder handler handles the page fault in two parts.
         * First it brings the necessary pages for the current packet
         * (and uses the pfault context), and then (after resuming the QP)
         * prefetches more pages. The second operation cannot use the pfault
         * context and therefore uses the dummy_pfault context allocated on
         * the stack */
        struct mlx5_ib_pfault dummy_pfault = {};

        dummy_pfault.mpfault.bytes_committed = 0;

        mpfault->rdma.rdma_va += mpfault->bytes_committed;
        mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
                                         mpfault->rdma.rdma_op_len);
        mpfault->bytes_committed = 0;

        address = mpfault->rdma.rdma_va;
        length  = mpfault->rdma.rdma_op_len;

        /* For some operations, the hardware cannot tell the exact message
         * length, and in those cases it reports zero. Use prefetch
         * logic. */
        if (length == 0) {
                prefetch_activated = 1;
                length = mpfault->rdma.packet_size;
                prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
        }

        ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
                                            NULL);
        if (ret == -EAGAIN) {
                /* We're racing with an invalidation, don't prefetch */
                prefetch_activated = 0;
        } else if (ret < 0 || pages_in_range(address, length) > ret) {
                mlx5_ib_page_fault_resume(qp, pfault, 1);
                return;
        }

        mlx5_ib_page_fault_resume(qp, pfault, 0);

        /* At this point, there might be a new pagefault already arriving in
         * the eq, switch to the dummy pagefault for the rest of the
         * processing. We're still OK with the objects being alive as the
         * work-queue is being fenced. */

        if (prefetch_activated) {
                ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
                                                    address,
                                                    prefetch_len,
                                                    NULL);
                if (ret < 0) {
                        pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
                                ret, prefetch_activated,
                                qp->ibqp.qp_num, address, prefetch_len);
                }
        }
}

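/*
 * Dispatch a page-fault event to the WQE or RDMA handler according to its
 * subtype. Unrecognized subtypes are resolved with an error so the QP does
 * not remain stalled.
 */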
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
                               struct mlx5_ib_pfault *pfault)
{
        u8 event_subtype = pfault->mpfault.event_subtype;

        switch (event_subtype) {
        case MLX5_PFAULT_SUBTYPE_WQE:
                mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
                break;
        case MLX5_PFAULT_SUBTYPE_RDMA:
                mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
                break;
        default:
                pr_warn("Invalid page fault event subtype: 0x%x\n",
                        event_subtype);
                mlx5_ib_page_fault_resume(qp, pfault, 1);
                break;
        }
}

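/* Work-queue handler that runs the page-fault processing for a single QP. */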
static void mlx5_ib_qp_pfault_action(struct work_struct *work)
{
        struct mlx5_ib_pfault *pfault = container_of(work,
                                                     struct mlx5_ib_pfault,
                                                     work);
        enum mlx5_ib_pagefault_context context =
                mlx5_ib_get_pagefault_context(&pfault->mpfault);
        struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
                                             pagefaults[context]);
        mlx5_ib_mr_pfault_handler(qp, pfault);
}

void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
{
        unsigned long flags;

        spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
        qp->disable_page_faults = 1;
        spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);

        /*
         * Note that at this point, we are guaranteed that no more
         * work queue elements will be posted to the work queue with
         * the QP we are closing.
         */
        flush_workqueue(mlx5_ib_page_fault_wq);
}

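/*
 * Counterpart of mlx5_ib_qp_disable_pagefaults(): allow page-fault events on
 * this QP to be queued to the page-fault workqueue again.
 */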
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
{
        unsigned long flags;

        spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
        qp->disable_page_faults = 0;
        spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
}

static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
                                   struct mlx5_pagefault *pfault)
{
        /*
         * Note that we will only get one fault event per QP per context
         * (responder/initiator, read/write), until we resolve the page fault
         * with the mlx5_ib_page_fault_resume command. Since this function is
         * called from within the work element, there is no risk of missing
         * events.
         */
        struct mlx5_ib_qp *mibqp = to_mibqp(qp);
        enum mlx5_ib_pagefault_context context =
                mlx5_ib_get_pagefault_context(pfault);
        struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];

        qp_pfault->mpfault = *pfault;

        /* No need to stop interrupts here since we are in an interrupt */
        spin_lock(&mibqp->disable_page_faults_lock);
        if (!mibqp->disable_page_faults)
                queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
        spin_unlock(&mibqp->disable_page_faults_lock);
}

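/*
 * Initialize the ODP page-fault state of a newly created QP: page faults
 * start disabled, and a work item is prepared for each page-fault context.
 */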
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
{
        int i;

        qp->disable_page_faults = 1;
        spin_lock_init(&qp->disable_page_faults_lock);

        qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler;

        for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
                INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
}

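/*
 * Per-device ODP setup: initialize the SRCU domain used to protect MR
 * lookups in the page-fault path.
 */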
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
{
        int ret;

        ret = init_srcu_struct(&ibdev->mr_srcu);
        if (ret)
                return ret;

        return 0;
}

void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
{
        cleanup_srcu_struct(&ibdev->mr_srcu);
}

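/*
 * Module-wide ODP setup: allocate the ordered workqueue on which page-fault
 * work items are executed.
 */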
int __init mlx5_ib_odp_init(void)
{
        mlx5_ib_page_fault_wq = alloc_ordered_workqueue("mlx5_ib_page_faults",
                                                        WQ_MEM_RECLAIM);
        if (!mlx5_ib_page_fault_wq)
                return -ENOMEM;

        return 0;
}

void mlx5_ib_odp_cleanup(void)
{
        destroy_workqueue(mlx5_ib_page_fault_wq);
}