blob: d4297dc43dc4c834ecfa457f36f221385353ecf2 [file] [log] [blame]
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */
39
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040040/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090051#include <linux/slab.h>
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040052
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040053#include "xprt_rdma.h"
54
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040055/*
56 * Globals/Macros
57 */
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63/*
64 * internal functions
65 */
66
67/*
68 * handle replies in tasklet context, using a single, global list
69 * rdma tasklet function -- just turn around and call the func
70 * for all replies on the list
71 */
72
73static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74static LIST_HEAD(rpcrdma_tasklets_g);
75
76static void
77rpcrdma_run_tasklet(unsigned long data)
78{
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
82
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
97
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 }
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101}
102
103static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
105static inline void
106rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107{
108 unsigned long flags;
109
110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113 tasklet_schedule(&rpcrdma_tasklet_g);
114}
115
116static void
117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118{
119 struct rpcrdma_ep *ep = context;
120
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
127 }
128}
129
130static void
131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132{
133 struct rpcrdma_ep *ep = context;
134
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
141 }
142}
143
144static inline
145void rpcrdma_event_process(struct ib_wc *wc)
146{
Tom Tucker5c635e02011-02-09 19:45:34 +0000147 struct rpcrdma_mw *frmr;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400148 struct rpcrdma_rep *rep =
149 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
150
151 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
152 __func__, rep, wc->status, wc->opcode, wc->byte_len);
153
154 if (!rep) /* send or bind completion that we don't care about */
155 return;
156
157 if (IB_WC_SUCCESS != wc->status) {
Tom Tucker5c635e02011-02-09 19:45:34 +0000158 dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
159 __func__, wc->opcode, wc->status);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400160 rep->rr_len = ~0U;
Tom Tucker5c635e02011-02-09 19:45:34 +0000161 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162 rpcrdma_schedule_tasklet(rep);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400163 return;
164 }
165
166 switch (wc->opcode) {
Tom Tucker5c635e02011-02-09 19:45:34 +0000167 case IB_WC_FAST_REG_MR:
168 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169 frmr->r.frmr.state = FRMR_IS_VALID;
170 break;
171 case IB_WC_LOCAL_INV:
172 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173 frmr->r.frmr.state = FRMR_IS_INVALID;
174 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400175 case IB_WC_RECV:
176 rep->rr_len = wc->byte_len;
177 ib_dma_sync_single_for_cpu(
178 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
179 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
180 /* Keep (only) the most recent credits, after check validity */
181 if (rep->rr_len >= 16) {
182 struct rpcrdma_msg *p =
183 (struct rpcrdma_msg *) rep->rr_base;
184 unsigned int credits = ntohl(p->rm_credit);
185 if (credits == 0) {
186 dprintk("RPC: %s: server"
187 " dropped credits to 0!\n", __func__);
188 /* don't deadlock */
189 credits = 1;
190 } else if (credits > rep->rr_buffer->rb_max_requests) {
191 dprintk("RPC: %s: server"
192 " over-crediting: %d (%d)\n",
193 __func__, credits,
194 rep->rr_buffer->rb_max_requests);
195 credits = rep->rr_buffer->rb_max_requests;
196 }
197 atomic_set(&rep->rr_buffer->rb_credits, credits);
198 }
199 /* fall through */
200 case IB_WC_BIND_MW:
201 rpcrdma_schedule_tasklet(rep);
202 break;
203 default:
204 dprintk("RPC: %s: unexpected WC event %X\n",
205 __func__, wc->opcode);
206 break;
207 }
208}
209
210static inline int
211rpcrdma_cq_poll(struct ib_cq *cq)
212{
213 struct ib_wc wc;
214 int rc;
215
216 for (;;) {
217 rc = ib_poll_cq(cq, 1, &wc);
218 if (rc < 0) {
219 dprintk("RPC: %s: ib_poll_cq failed %i\n",
220 __func__, rc);
221 return rc;
222 }
223 if (rc == 0)
224 break;
225
226 rpcrdma_event_process(&wc);
227 }
228
229 return 0;
230}
231
232/*
233 * rpcrdma_cq_event_upcall
234 *
235 * This upcall handles recv, send, bind and unbind events.
236 * It is reentrant but processes single events in order to maintain
237 * ordering of receives to keep server credits.
238 *
239 * It is the responsibility of the scheduled tasklet to return
240 * recv buffers to the pool. NOTE: this affects synchronization of
241 * connection shutdown. That is, the structures required for
242 * the completion of the reply handler must remain intact until
243 * all memory has been reclaimed.
244 *
245 * Note that send events are suppressed and do not result in an upcall.
246 */
247static void
248rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
249{
250 int rc;
251
252 rc = rpcrdma_cq_poll(cq);
253 if (rc)
254 return;
255
256 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
257 if (rc) {
258 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
259 __func__, rc);
260 return;
261 }
262
263 rpcrdma_cq_poll(cq);
264}
265
266#ifdef RPC_DEBUG
267static const char * const conn[] = {
268 "address resolved",
269 "address error",
270 "route resolved",
271 "route error",
272 "connect request",
273 "connect response",
274 "connect error",
275 "unreachable",
276 "rejected",
277 "established",
278 "disconnected",
279 "device removal"
280};
281#endif
282
283static int
284rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
285{
286 struct rpcrdma_xprt *xprt = id->context;
287 struct rpcrdma_ia *ia = &xprt->rx_ia;
288 struct rpcrdma_ep *ep = &xprt->rx_ep;
Ingo Molnarff0db042008-11-25 16:58:42 -0800289#ifdef RPC_DEBUG
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400290 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
Ingo Molnarff0db042008-11-25 16:58:42 -0800291#endif
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400292 struct ib_qp_attr attr;
293 struct ib_qp_init_attr iattr;
294 int connstate = 0;
295
296 switch (event->event) {
297 case RDMA_CM_EVENT_ADDR_RESOLVED:
298 case RDMA_CM_EVENT_ROUTE_RESOLVED:
Tom Talpey5675add2008-10-09 15:01:41 -0400299 ia->ri_async_rc = 0;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400300 complete(&ia->ri_done);
301 break;
302 case RDMA_CM_EVENT_ADDR_ERROR:
303 ia->ri_async_rc = -EHOSTUNREACH;
304 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
305 __func__, ep);
306 complete(&ia->ri_done);
307 break;
308 case RDMA_CM_EVENT_ROUTE_ERROR:
309 ia->ri_async_rc = -ENETUNREACH;
310 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
311 __func__, ep);
312 complete(&ia->ri_done);
313 break;
314 case RDMA_CM_EVENT_ESTABLISHED:
315 connstate = 1;
316 ib_query_qp(ia->ri_id->qp, &attr,
317 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
318 &iattr);
319 dprintk("RPC: %s: %d responder resources"
320 " (%d initiator)\n",
321 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
322 goto connected;
323 case RDMA_CM_EVENT_CONNECT_ERROR:
324 connstate = -ENOTCONN;
325 goto connected;
326 case RDMA_CM_EVENT_UNREACHABLE:
327 connstate = -ENETDOWN;
328 goto connected;
329 case RDMA_CM_EVENT_REJECTED:
330 connstate = -ECONNREFUSED;
331 goto connected;
332 case RDMA_CM_EVENT_DISCONNECTED:
333 connstate = -ECONNABORTED;
334 goto connected;
335 case RDMA_CM_EVENT_DEVICE_REMOVAL:
336 connstate = -ENODEV;
337connected:
Harvey Harrison21454aa2008-10-31 00:54:56 -0700338 dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400339 __func__,
340 (event->event <= 11) ? conn[event->event] :
341 "unknown connection error",
Harvey Harrison21454aa2008-10-31 00:54:56 -0700342 &addr->sin_addr.s_addr,
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400343 ntohs(addr->sin_port),
344 ep, event->event);
345 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
346 dprintk("RPC: %s: %sconnected\n",
347 __func__, connstate > 0 ? "" : "dis");
348 ep->rep_connected = connstate;
349 ep->rep_func(ep);
350 wake_up_all(&ep->rep_connect_wait);
351 break;
352 default:
Tom Talpey1a954052008-10-09 15:01:31 -0400353 dprintk("RPC: %s: unexpected CM event %d\n",
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400354 __func__, event->event);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400355 break;
356 }
357
Tom Talpeyb3cd8d42008-10-09 15:02:02 -0400358#ifdef RPC_DEBUG
359 if (connstate == 1) {
360 int ird = attr.max_dest_rd_atomic;
361 int tird = ep->rep_remote_cma.responder_resources;
Harvey Harrison21454aa2008-10-31 00:54:56 -0700362 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
Tom Talpeyb3cd8d42008-10-09 15:02:02 -0400363 "on %s, memreg %d slots %d ird %d%s\n",
Harvey Harrison21454aa2008-10-31 00:54:56 -0700364 &addr->sin_addr.s_addr,
Tom Talpeyb3cd8d42008-10-09 15:02:02 -0400365 ntohs(addr->sin_port),
366 ia->ri_id->device->name,
367 ia->ri_memreg_strategy,
368 xprt->rx_buf.rb_max_requests,
369 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
370 } else if (connstate < 0) {
Harvey Harrison21454aa2008-10-31 00:54:56 -0700371 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
372 &addr->sin_addr.s_addr,
Tom Talpeyb3cd8d42008-10-09 15:02:02 -0400373 ntohs(addr->sin_port),
374 connstate);
375 }
376#endif
377
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400378 return 0;
379}
380
381static struct rdma_cm_id *
382rpcrdma_create_id(struct rpcrdma_xprt *xprt,
383 struct rpcrdma_ia *ia, struct sockaddr *addr)
384{
385 struct rdma_cm_id *id;
386 int rc;
387
Tom Talpey1a954052008-10-09 15:01:31 -0400388 init_completion(&ia->ri_done);
389
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400390 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
391 if (IS_ERR(id)) {
392 rc = PTR_ERR(id);
393 dprintk("RPC: %s: rdma_create_id() failed %i\n",
394 __func__, rc);
395 return id;
396 }
397
Tom Talpey5675add2008-10-09 15:01:41 -0400398 ia->ri_async_rc = -ETIMEDOUT;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400399 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
400 if (rc) {
401 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
402 __func__, rc);
403 goto out;
404 }
Tom Talpey5675add2008-10-09 15:01:41 -0400405 wait_for_completion_interruptible_timeout(&ia->ri_done,
406 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400407 rc = ia->ri_async_rc;
408 if (rc)
409 goto out;
410
Tom Talpey5675add2008-10-09 15:01:41 -0400411 ia->ri_async_rc = -ETIMEDOUT;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400412 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
413 if (rc) {
414 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
415 __func__, rc);
416 goto out;
417 }
Tom Talpey5675add2008-10-09 15:01:41 -0400418 wait_for_completion_interruptible_timeout(&ia->ri_done,
419 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400420 rc = ia->ri_async_rc;
421 if (rc)
422 goto out;
423
424 return id;
425
426out:
427 rdma_destroy_id(id);
428 return ERR_PTR(rc);
429}
430
431/*
432 * Drain any cq, prior to teardown.
433 */
434static void
435rpcrdma_clean_cq(struct ib_cq *cq)
436{
437 struct ib_wc wc;
438 int count = 0;
439
440 while (1 == ib_poll_cq(cq, 1, &wc))
441 ++count;
442
443 if (count)
444 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
445 __func__, count, wc.opcode);
446}
447
448/*
449 * Exported functions.
450 */
451
452/*
453 * Open and initialize an Interface Adapter.
454 * o initializes fields of struct rpcrdma_ia, including
455 * interface and provider attributes and protection zone.
456 */
457int
458rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
459{
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400460 int rc, mem_priv;
461 struct ib_device_attr devattr;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400462 struct rpcrdma_ia *ia = &xprt->rx_ia;
463
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400464 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
465 if (IS_ERR(ia->ri_id)) {
466 rc = PTR_ERR(ia->ri_id);
467 goto out1;
468 }
469
470 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
471 if (IS_ERR(ia->ri_pd)) {
472 rc = PTR_ERR(ia->ri_pd);
473 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
474 __func__, rc);
475 goto out2;
476 }
477
478 /*
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400479 * Query the device to determine if the requested memory
480 * registration strategy is supported. If it isn't, set the
481 * strategy to a globally supported model.
482 */
483 rc = ib_query_device(ia->ri_id->device, &devattr);
484 if (rc) {
485 dprintk("RPC: %s: ib_query_device failed %d\n",
486 __func__, rc);
487 goto out2;
488 }
489
490 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
491 ia->ri_have_dma_lkey = 1;
492 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
493 }
494
495 switch (memreg) {
496 case RPCRDMA_MEMWINDOWS:
497 case RPCRDMA_MEMWINDOWS_ASYNC:
498 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
499 dprintk("RPC: %s: MEMWINDOWS registration "
500 "specified but not supported by adapter, "
501 "using slower RPCRDMA_REGISTER\n",
502 __func__);
503 memreg = RPCRDMA_REGISTER;
504 }
505 break;
506 case RPCRDMA_MTHCAFMR:
507 if (!ia->ri_id->device->alloc_fmr) {
508#if RPCRDMA_PERSISTENT_REGISTRATION
509 dprintk("RPC: %s: MTHCAFMR registration "
510 "specified but not supported by adapter, "
511 "using riskier RPCRDMA_ALLPHYSICAL\n",
512 __func__);
513 memreg = RPCRDMA_ALLPHYSICAL;
514#else
515 dprintk("RPC: %s: MTHCAFMR registration "
516 "specified but not supported by adapter, "
517 "using slower RPCRDMA_REGISTER\n",
518 __func__);
519 memreg = RPCRDMA_REGISTER;
520#endif
521 }
522 break;
Tom Talpey3197d3092008-10-09 15:00:20 -0400523 case RPCRDMA_FRMR:
524 /* Requires both frmr reg and local dma lkey */
525 if ((devattr.device_cap_flags &
526 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
527 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
528#if RPCRDMA_PERSISTENT_REGISTRATION
529 dprintk("RPC: %s: FRMR registration "
530 "specified but not supported by adapter, "
531 "using riskier RPCRDMA_ALLPHYSICAL\n",
532 __func__);
533 memreg = RPCRDMA_ALLPHYSICAL;
534#else
535 dprintk("RPC: %s: FRMR registration "
536 "specified but not supported by adapter, "
537 "using slower RPCRDMA_REGISTER\n",
538 __func__);
539 memreg = RPCRDMA_REGISTER;
540#endif
541 }
542 break;
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400543 }
544
545 /*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400546 * Optionally obtain an underlying physical identity mapping in
547 * order to do a memory window-based bind. This base registration
548 * is protected from remote access - that is enabled only by binding
549 * for the specific bytes targeted during each RPC operation, and
550 * revoked after the corresponding completion similar to a storage
551 * adapter.
552 */
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400553 switch (memreg) {
554 case RPCRDMA_BOUNCEBUFFERS:
555 case RPCRDMA_REGISTER:
Tom Talpey3197d3092008-10-09 15:00:20 -0400556 case RPCRDMA_FRMR:
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400557 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400558#if RPCRDMA_PERSISTENT_REGISTRATION
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400559 case RPCRDMA_ALLPHYSICAL:
560 mem_priv = IB_ACCESS_LOCAL_WRITE |
561 IB_ACCESS_REMOTE_WRITE |
562 IB_ACCESS_REMOTE_READ;
563 goto register_setup;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400564#endif
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400565 case RPCRDMA_MEMWINDOWS_ASYNC:
566 case RPCRDMA_MEMWINDOWS:
567 mem_priv = IB_ACCESS_LOCAL_WRITE |
568 IB_ACCESS_MW_BIND;
569 goto register_setup;
570 case RPCRDMA_MTHCAFMR:
571 if (ia->ri_have_dma_lkey)
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400572 break;
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400573 mem_priv = IB_ACCESS_LOCAL_WRITE;
574 register_setup:
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400575 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
576 if (IS_ERR(ia->ri_bind_mem)) {
577 printk(KERN_ALERT "%s: ib_get_dma_mr for "
578 "phys register failed with %lX\n\t"
579 "Will continue with degraded performance\n",
580 __func__, PTR_ERR(ia->ri_bind_mem));
581 memreg = RPCRDMA_REGISTER;
582 ia->ri_bind_mem = NULL;
583 }
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400584 break;
585 default:
586 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
587 __func__, memreg);
588 rc = -EINVAL;
589 goto out2;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400590 }
Tom Talpeybd7ed1d2008-10-09 15:00:09 -0400591 dprintk("RPC: %s: memory registration strategy is %d\n",
592 __func__, memreg);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400593
594 /* Else will do memory reg/dereg for each chunk */
595 ia->ri_memreg_strategy = memreg;
596
597 return 0;
598out2:
599 rdma_destroy_id(ia->ri_id);
Tom Talpeyfee08ca2008-10-09 15:01:00 -0400600 ia->ri_id = NULL;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400601out1:
602 return rc;
603}
604
605/*
606 * Clean up/close an IA.
607 * o if event handles and PD have been initialized, free them.
608 * o close the IA
609 */
610void
611rpcrdma_ia_close(struct rpcrdma_ia *ia)
612{
613 int rc;
614
615 dprintk("RPC: %s: entering\n", __func__);
616 if (ia->ri_bind_mem != NULL) {
617 rc = ib_dereg_mr(ia->ri_bind_mem);
618 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
619 __func__, rc);
620 }
Tom Talpeyfee08ca2008-10-09 15:01:00 -0400621 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
622 if (ia->ri_id->qp)
623 rdma_destroy_qp(ia->ri_id);
624 rdma_destroy_id(ia->ri_id);
625 ia->ri_id = NULL;
626 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400627 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
628 rc = ib_dealloc_pd(ia->ri_pd);
629 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
630 __func__, rc);
631 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400632}
633
634/*
635 * Create unconnected endpoint.
636 */
637int
638rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
639 struct rpcrdma_create_data_internal *cdata)
640{
641 struct ib_device_attr devattr;
Chuck Lever5d40a8a2007-10-26 13:30:54 -0400642 int rc, err;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400643
644 rc = ib_query_device(ia->ri_id->device, &devattr);
645 if (rc) {
646 dprintk("RPC: %s: ib_query_device failed %d\n",
647 __func__, rc);
648 return rc;
649 }
650
651 /* check provider's send/recv wr limits */
652 if (cdata->max_requests > devattr.max_qp_wr)
653 cdata->max_requests = devattr.max_qp_wr;
654
655 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
656 ep->rep_attr.qp_context = ep;
657 /* send_cq and recv_cq initialized below */
658 ep->rep_attr.srq = NULL;
659 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
660 switch (ia->ri_memreg_strategy) {
Tom Talpey3197d3092008-10-09 15:00:20 -0400661 case RPCRDMA_FRMR:
Tom Tucker15cdc6442010-08-11 12:47:24 -0400662 /* Add room for frmr register and invalidate WRs.
663 * 1. FRMR reg WR for head
664 * 2. FRMR invalidate WR for head
665 * 3. FRMR reg WR for pagelist
666 * 4. FRMR invalidate WR for pagelist
667 * 5. FRMR reg WR for tail
668 * 6. FRMR invalidate WR for tail
669 * 7. The RDMA_SEND WR
670 */
671 ep->rep_attr.cap.max_send_wr *= 7;
672 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
673 cdata->max_requests = devattr.max_qp_wr / 7;
674 if (!cdata->max_requests)
675 return -EINVAL;
676 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
677 }
Tom Talpey3197d3092008-10-09 15:00:20 -0400678 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400679 case RPCRDMA_MEMWINDOWS_ASYNC:
680 case RPCRDMA_MEMWINDOWS:
681 /* Add room for mw_binds+unbinds - overkill! */
682 ep->rep_attr.cap.max_send_wr++;
683 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
684 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
685 return -EINVAL;
686 break;
687 default:
688 break;
689 }
690 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
691 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
692 ep->rep_attr.cap.max_recv_sge = 1;
693 ep->rep_attr.cap.max_inline_data = 0;
694 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
695 ep->rep_attr.qp_type = IB_QPT_RC;
696 ep->rep_attr.port_num = ~0;
697
698 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
699 "iovs: send %d recv %d\n",
700 __func__,
701 ep->rep_attr.cap.max_send_wr,
702 ep->rep_attr.cap.max_recv_wr,
703 ep->rep_attr.cap.max_send_sge,
704 ep->rep_attr.cap.max_recv_sge);
705
706 /* set trigger for requesting send completion */
707 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
708 switch (ia->ri_memreg_strategy) {
709 case RPCRDMA_MEMWINDOWS_ASYNC:
710 case RPCRDMA_MEMWINDOWS:
711 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
712 break;
713 default:
714 break;
715 }
716 if (ep->rep_cqinit <= 2)
717 ep->rep_cqinit = 0;
718 INIT_CQCOUNT(ep);
719 ep->rep_ia = ia;
720 init_waitqueue_head(&ep->rep_connect_wait);
721
722 /*
723 * Create a single cq for receive dto and mw_bind (only ever
724 * care about unbind, really). Send completions are suppressed.
725 * Use single threaded tasklet upcalls to maintain ordering.
726 */
727 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
728 rpcrdma_cq_async_error_upcall, NULL,
729 ep->rep_attr.cap.max_recv_wr +
730 ep->rep_attr.cap.max_send_wr + 1, 0);
731 if (IS_ERR(ep->rep_cq)) {
732 rc = PTR_ERR(ep->rep_cq);
733 dprintk("RPC: %s: ib_create_cq failed: %i\n",
734 __func__, rc);
735 goto out1;
736 }
737
738 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
739 if (rc) {
740 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
741 __func__, rc);
742 goto out2;
743 }
744
745 ep->rep_attr.send_cq = ep->rep_cq;
746 ep->rep_attr.recv_cq = ep->rep_cq;
747
748 /* Initialize cma parameters */
749
750 /* RPC/RDMA does not use private data */
751 ep->rep_remote_cma.private_data = NULL;
752 ep->rep_remote_cma.private_data_len = 0;
753
754 /* Client offers RDMA Read but does not initiate */
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400755 ep->rep_remote_cma.initiator_depth = 0;
Tom Tuckerb334eaa2008-10-09 15:00:30 -0400756 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
757 ep->rep_remote_cma.responder_resources = 0;
758 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
759 ep->rep_remote_cma.responder_resources = 32;
760 else
761 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400762
763 ep->rep_remote_cma.retry_count = 7;
764 ep->rep_remote_cma.flow_control = 0;
765 ep->rep_remote_cma.rnr_retry_count = 0;
766
767 return 0;
768
769out2:
Chuck Lever5d40a8a2007-10-26 13:30:54 -0400770 err = ib_destroy_cq(ep->rep_cq);
771 if (err)
772 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
773 __func__, err);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400774out1:
775 return rc;
776}
777
778/*
779 * rpcrdma_ep_destroy
780 *
781 * Disconnect and destroy endpoint. After this, the only
782 * valid operations on the ep are to free it (if dynamically
783 * allocated) or re-create it.
784 *
785 * The caller's error handling must be sure to not leak the endpoint
786 * if this function fails.
787 */
788int
789rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
790{
791 int rc;
792
793 dprintk("RPC: %s: entering, connected is %d\n",
794 __func__, ep->rep_connected);
795
796 if (ia->ri_id->qp) {
797 rc = rpcrdma_ep_disconnect(ep, ia);
798 if (rc)
799 dprintk("RPC: %s: rpcrdma_ep_disconnect"
800 " returned %i\n", __func__, rc);
Tom Talpeyfee08ca2008-10-09 15:01:00 -0400801 rdma_destroy_qp(ia->ri_id);
802 ia->ri_id->qp = NULL;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400803 }
804
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400805 /* padding - could be done in rpcrdma_buffer_destroy... */
806 if (ep->rep_pad_mr) {
807 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
808 ep->rep_pad_mr = NULL;
809 }
810
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400811 rpcrdma_clean_cq(ep->rep_cq);
812 rc = ib_destroy_cq(ep->rep_cq);
813 if (rc)
814 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
815 __func__, rc);
816
817 return rc;
818}
819
820/*
821 * Connect unconnected endpoint.
822 */
823int
824rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
825{
826 struct rdma_cm_id *id;
827 int rc = 0;
828 int retry_count = 0;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400829
Tom Talpeyc0555512008-10-10 11:32:45 -0400830 if (ep->rep_connected != 0) {
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400831 struct rpcrdma_xprt *xprt;
832retry:
833 rc = rpcrdma_ep_disconnect(ep, ia);
834 if (rc && rc != -ENOTCONN)
835 dprintk("RPC: %s: rpcrdma_ep_disconnect"
836 " status %i\n", __func__, rc);
837 rpcrdma_clean_cq(ep->rep_cq);
838
839 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
840 id = rpcrdma_create_id(xprt, ia,
841 (struct sockaddr *)&xprt->rx_data.addr);
842 if (IS_ERR(id)) {
843 rc = PTR_ERR(id);
844 goto out;
845 }
846 /* TEMP TEMP TEMP - fail if new device:
847 * Deregister/remarshal *all* requests!
848 * Close and recreate adapter, pd, etc!
849 * Re-determine all attributes still sane!
850 * More stuff I haven't thought of!
851 * Rrrgh!
852 */
853 if (ia->ri_id->device != id->device) {
854 printk("RPC: %s: can't reconnect on "
855 "different device!\n", __func__);
856 rdma_destroy_id(id);
857 rc = -ENETDOWN;
858 goto out;
859 }
860 /* END TEMP */
Tom Talpey1a954052008-10-09 15:01:31 -0400861 rdma_destroy_qp(ia->ri_id);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400862 rdma_destroy_id(ia->ri_id);
863 ia->ri_id = id;
864 }
865
866 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
867 if (rc) {
868 dprintk("RPC: %s: rdma_create_qp failed %i\n",
869 __func__, rc);
870 goto out;
871 }
872
873/* XXX Tavor device performs badly with 2K MTU! */
874if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
875 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
876 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
877 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
878 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
879 struct ib_qp_attr attr = {
880 .path_mtu = IB_MTU_1024
881 };
882 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
883 }
884}
885
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400886 ep->rep_connected = 0;
887
888 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
889 if (rc) {
890 dprintk("RPC: %s: rdma_connect() failed with %i\n",
891 __func__, rc);
892 goto out;
893 }
894
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400895 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
896
897 /*
898 * Check state. A non-peer reject indicates no listener
899 * (ECONNREFUSED), which may be a transient state. All
900 * others indicate a transport condition which has already
901 * undergone a best-effort.
902 */
Joe Perchesf64f9e72009-11-29 16:55:45 -0800903 if (ep->rep_connected == -ECONNREFUSED &&
904 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400905 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
906 goto retry;
907 }
908 if (ep->rep_connected <= 0) {
909 /* Sometimes, the only way to reliably connect to remote
910 * CMs is to use same nonzero values for ORD and IRD. */
Tom Tuckerb334eaa2008-10-09 15:00:30 -0400911 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
912 (ep->rep_remote_cma.responder_resources == 0 ||
913 ep->rep_remote_cma.initiator_depth !=
914 ep->rep_remote_cma.responder_resources)) {
915 if (ep->rep_remote_cma.responder_resources == 0)
916 ep->rep_remote_cma.responder_resources = 1;
917 ep->rep_remote_cma.initiator_depth =
918 ep->rep_remote_cma.responder_resources;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400919 goto retry;
Tom Tuckerb334eaa2008-10-09 15:00:30 -0400920 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400921 rc = ep->rep_connected;
922 } else {
923 dprintk("RPC: %s: connected\n", __func__);
924 }
925
926out:
927 if (rc)
928 ep->rep_connected = rc;
929 return rc;
930}
931
932/*
933 * rpcrdma_ep_disconnect
934 *
935 * This is separate from destroy to facilitate the ability
936 * to reconnect without recreating the endpoint.
937 *
938 * This call is not reentrant, and must not be made in parallel
939 * on the same endpoint.
940 */
941int
942rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
943{
944 int rc;
945
946 rpcrdma_clean_cq(ep->rep_cq);
947 rc = rdma_disconnect(ia->ri_id);
948 if (!rc) {
949 /* returns without wait if not connected */
950 wait_event_interruptible(ep->rep_connect_wait,
951 ep->rep_connected != 1);
952 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
953 (ep->rep_connected == 1) ? "still " : "dis");
954 } else {
955 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
956 ep->rep_connected = rc;
957 }
958 return rc;
959}
960
961/*
962 * Initialize buffer memory
963 */
964int
965rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
966 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
967{
968 char *p;
969 size_t len;
970 int i, rc;
Tom Talpey8d4ba032008-10-09 14:59:49 -0400971 struct rpcrdma_mw *r;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400972
973 buf->rb_max_requests = cdata->max_requests;
974 spin_lock_init(&buf->rb_lock);
975 atomic_set(&buf->rb_credits, 1);
976
977 /* Need to allocate:
978 * 1. arrays for send and recv pointers
979 * 2. arrays of struct rpcrdma_req to fill in pointers
980 * 3. array of struct rpcrdma_rep for replies
981 * 4. padding, if any
Tom Talpey3197d3092008-10-09 15:00:20 -0400982 * 5. mw's, fmr's or frmr's, if any
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400983 * Send/recv buffers in req/rep need to be registered
984 */
985
986 len = buf->rb_max_requests *
987 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
988 len += cdata->padding;
989 switch (ia->ri_memreg_strategy) {
Tom Talpey3197d3092008-10-09 15:00:20 -0400990 case RPCRDMA_FRMR:
991 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
992 sizeof(struct rpcrdma_mw);
993 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -0400994 case RPCRDMA_MTHCAFMR:
995 /* TBD we are perhaps overallocating here */
996 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
997 sizeof(struct rpcrdma_mw);
998 break;
999 case RPCRDMA_MEMWINDOWS_ASYNC:
1000 case RPCRDMA_MEMWINDOWS:
1001 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002 sizeof(struct rpcrdma_mw);
1003 break;
1004 default:
1005 break;
1006 }
1007
1008 /* allocate 1, 4 and 5 in one shot */
1009 p = kzalloc(len, GFP_KERNEL);
1010 if (p == NULL) {
1011 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1012 __func__, len);
1013 rc = -ENOMEM;
1014 goto out;
1015 }
1016 buf->rb_pool = p; /* for freeing it later */
1017
1018 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1019 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1020 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1021 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1022
1023 /*
1024 * Register the zeroed pad buffer, if any.
1025 */
1026 if (cdata->padding) {
1027 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1028 &ep->rep_pad_mr, &ep->rep_pad);
1029 if (rc)
1030 goto out;
1031 }
1032 p += cdata->padding;
1033
1034 /*
1035 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1036 * We "cycle" the mw's in order to minimize rkey reuse,
1037 * and also reduce unbind-to-bind collision.
1038 */
1039 INIT_LIST_HEAD(&buf->rb_mws);
Tom Talpey8d4ba032008-10-09 14:59:49 -04001040 r = (struct rpcrdma_mw *)p;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001041 switch (ia->ri_memreg_strategy) {
Tom Talpey3197d3092008-10-09 15:00:20 -04001042 case RPCRDMA_FRMR:
1043 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1044 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1045 RPCRDMA_MAX_SEGS);
1046 if (IS_ERR(r->r.frmr.fr_mr)) {
1047 rc = PTR_ERR(r->r.frmr.fr_mr);
1048 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1049 " failed %i\n", __func__, rc);
1050 goto out;
1051 }
1052 r->r.frmr.fr_pgl =
1053 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1054 RPCRDMA_MAX_SEGS);
1055 if (IS_ERR(r->r.frmr.fr_pgl)) {
1056 rc = PTR_ERR(r->r.frmr.fr_pgl);
1057 dprintk("RPC: %s: "
1058 "ib_alloc_fast_reg_page_list "
1059 "failed %i\n", __func__, rc);
1060 goto out;
1061 }
1062 list_add(&r->mw_list, &buf->rb_mws);
1063 ++r;
1064 }
1065 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001066 case RPCRDMA_MTHCAFMR:
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001067 /* TBD we are perhaps overallocating here */
1068 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
Tom Talpey8d4ba032008-10-09 14:59:49 -04001069 static struct ib_fmr_attr fa =
1070 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001071 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1072 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1073 &fa);
1074 if (IS_ERR(r->r.fmr)) {
1075 rc = PTR_ERR(r->r.fmr);
1076 dprintk("RPC: %s: ib_alloc_fmr"
1077 " failed %i\n", __func__, rc);
1078 goto out;
1079 }
1080 list_add(&r->mw_list, &buf->rb_mws);
1081 ++r;
1082 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001083 break;
1084 case RPCRDMA_MEMWINDOWS_ASYNC:
1085 case RPCRDMA_MEMWINDOWS:
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001086 /* Allocate one extra request's worth, for full cycling */
1087 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1088 r->r.mw = ib_alloc_mw(ia->ri_pd);
1089 if (IS_ERR(r->r.mw)) {
1090 rc = PTR_ERR(r->r.mw);
1091 dprintk("RPC: %s: ib_alloc_mw"
1092 " failed %i\n", __func__, rc);
1093 goto out;
1094 }
1095 list_add(&r->mw_list, &buf->rb_mws);
1096 ++r;
1097 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001098 break;
1099 default:
1100 break;
1101 }
1102
1103 /*
1104 * Allocate/init the request/reply buffers. Doing this
1105 * using kmalloc for now -- one for each buf.
1106 */
1107 for (i = 0; i < buf->rb_max_requests; i++) {
1108 struct rpcrdma_req *req;
1109 struct rpcrdma_rep *rep;
1110
1111 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1112 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1113 /* Typical ~2400b, so rounding up saves work later */
1114 if (len < 4096)
1115 len = 4096;
1116 req = kmalloc(len, GFP_KERNEL);
1117 if (req == NULL) {
1118 dprintk("RPC: %s: request buffer %d alloc"
1119 " failed\n", __func__, i);
1120 rc = -ENOMEM;
1121 goto out;
1122 }
1123 memset(req, 0, sizeof(struct rpcrdma_req));
1124 buf->rb_send_bufs[i] = req;
1125 buf->rb_send_bufs[i]->rl_buffer = buf;
1126
1127 rc = rpcrdma_register_internal(ia, req->rl_base,
1128 len - offsetof(struct rpcrdma_req, rl_base),
1129 &buf->rb_send_bufs[i]->rl_handle,
1130 &buf->rb_send_bufs[i]->rl_iov);
1131 if (rc)
1132 goto out;
1133
1134 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1135
1136 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1137 rep = kmalloc(len, GFP_KERNEL);
1138 if (rep == NULL) {
1139 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1140 __func__, i);
1141 rc = -ENOMEM;
1142 goto out;
1143 }
1144 memset(rep, 0, sizeof(struct rpcrdma_rep));
1145 buf->rb_recv_bufs[i] = rep;
1146 buf->rb_recv_bufs[i]->rr_buffer = buf;
1147 init_waitqueue_head(&rep->rr_unbind);
1148
1149 rc = rpcrdma_register_internal(ia, rep->rr_base,
1150 len - offsetof(struct rpcrdma_rep, rr_base),
1151 &buf->rb_recv_bufs[i]->rr_handle,
1152 &buf->rb_recv_bufs[i]->rr_iov);
1153 if (rc)
1154 goto out;
1155
1156 }
1157 dprintk("RPC: %s: max_requests %d\n",
1158 __func__, buf->rb_max_requests);
1159 /* done */
1160 return 0;
1161out:
1162 rpcrdma_buffer_destroy(buf);
1163 return rc;
1164}
1165
1166/*
1167 * Unregister and destroy buffer memory. Need to deal with
1168 * partial initialization, so it's callable from failed create.
1169 * Must be called before destroying endpoint, as registrations
1170 * reference it.
1171 */
1172void
1173rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1174{
1175 int rc, i;
1176 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
Tom Talpey8d4ba032008-10-09 14:59:49 -04001177 struct rpcrdma_mw *r;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001178
1179 /* clean up in reverse order from create
1180 * 1. recv mr memory (mr free, then kfree)
1181 * 1a. bind mw memory
1182 * 2. send mr memory (mr free, then kfree)
1183 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1184 * 4. arrays
1185 */
1186 dprintk("RPC: %s: entering\n", __func__);
1187
1188 for (i = 0; i < buf->rb_max_requests; i++) {
1189 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1190 rpcrdma_deregister_internal(ia,
1191 buf->rb_recv_bufs[i]->rr_handle,
1192 &buf->rb_recv_bufs[i]->rr_iov);
1193 kfree(buf->rb_recv_bufs[i]);
1194 }
1195 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1196 while (!list_empty(&buf->rb_mws)) {
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001197 r = list_entry(buf->rb_mws.next,
1198 struct rpcrdma_mw, mw_list);
1199 list_del(&r->mw_list);
1200 switch (ia->ri_memreg_strategy) {
Tom Talpey3197d3092008-10-09 15:00:20 -04001201 case RPCRDMA_FRMR:
1202 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1203 if (rc)
1204 dprintk("RPC: %s:"
1205 " ib_dereg_mr"
1206 " failed %i\n",
1207 __func__, rc);
1208 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1209 break;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001210 case RPCRDMA_MTHCAFMR:
1211 rc = ib_dealloc_fmr(r->r.fmr);
1212 if (rc)
1213 dprintk("RPC: %s:"
1214 " ib_dealloc_fmr"
1215 " failed %i\n",
1216 __func__, rc);
1217 break;
1218 case RPCRDMA_MEMWINDOWS_ASYNC:
1219 case RPCRDMA_MEMWINDOWS:
1220 rc = ib_dealloc_mw(r->r.mw);
1221 if (rc)
1222 dprintk("RPC: %s:"
1223 " ib_dealloc_mw"
1224 " failed %i\n",
1225 __func__, rc);
1226 break;
1227 default:
1228 break;
1229 }
1230 }
1231 rpcrdma_deregister_internal(ia,
1232 buf->rb_send_bufs[i]->rl_handle,
1233 &buf->rb_send_bufs[i]->rl_iov);
1234 kfree(buf->rb_send_bufs[i]);
1235 }
1236 }
1237
1238 kfree(buf->rb_pool);
1239}
1240
1241/*
1242 * Get a set of request/reply buffers.
1243 *
1244 * Reply buffer (if needed) is attached to send buffer upon return.
1245 * Rule:
1246 * rb_send_index and rb_recv_index MUST always be pointing to the
1247 * *next* available buffer (non-NULL). They are incremented after
1248 * removing buffers, and decremented *before* returning them.
1249 */
1250struct rpcrdma_req *
1251rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1252{
1253 struct rpcrdma_req *req;
1254 unsigned long flags;
Tom Talpey8d4ba032008-10-09 14:59:49 -04001255 int i;
1256 struct rpcrdma_mw *r;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001257
1258 spin_lock_irqsave(&buffers->rb_lock, flags);
1259 if (buffers->rb_send_index == buffers->rb_max_requests) {
1260 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1261 dprintk("RPC: %s: out of request buffers\n", __func__);
1262 return ((struct rpcrdma_req *)NULL);
1263 }
1264
1265 req = buffers->rb_send_bufs[buffers->rb_send_index];
1266 if (buffers->rb_send_index < buffers->rb_recv_index) {
1267 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1268 __func__,
1269 buffers->rb_recv_index - buffers->rb_send_index);
1270 req->rl_reply = NULL;
1271 } else {
1272 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1273 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1274 }
1275 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1276 if (!list_empty(&buffers->rb_mws)) {
Tom Talpey8d4ba032008-10-09 14:59:49 -04001277 i = RPCRDMA_MAX_SEGS - 1;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001278 do {
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001279 r = list_entry(buffers->rb_mws.next,
1280 struct rpcrdma_mw, mw_list);
1281 list_del(&r->mw_list);
1282 req->rl_segments[i].mr_chunk.rl_mw = r;
1283 } while (--i >= 0);
1284 }
1285 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1286 return req;
1287}
1288
1289/*
1290 * Put request/reply buffers back into pool.
1291 * Pre-decrement counter/array index.
1292 */
1293void
1294rpcrdma_buffer_put(struct rpcrdma_req *req)
1295{
1296 struct rpcrdma_buffer *buffers = req->rl_buffer;
1297 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1298 int i;
1299 unsigned long flags;
1300
1301 BUG_ON(req->rl_nchunks != 0);
1302 spin_lock_irqsave(&buffers->rb_lock, flags);
1303 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1304 req->rl_niovs = 0;
1305 if (req->rl_reply) {
1306 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1307 init_waitqueue_head(&req->rl_reply->rr_unbind);
1308 req->rl_reply->rr_func = NULL;
1309 req->rl_reply = NULL;
1310 }
1311 switch (ia->ri_memreg_strategy) {
Tom Talpey3197d3092008-10-09 15:00:20 -04001312 case RPCRDMA_FRMR:
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001313 case RPCRDMA_MTHCAFMR:
1314 case RPCRDMA_MEMWINDOWS_ASYNC:
1315 case RPCRDMA_MEMWINDOWS:
1316 /*
1317 * Cycle mw's back in reverse order, and "spin" them.
1318 * This delays and scrambles reuse as much as possible.
1319 */
1320 i = 1;
1321 do {
1322 struct rpcrdma_mw **mw;
1323 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325 *mw = NULL;
1326 } while (++i < RPCRDMA_MAX_SEGS);
1327 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328 &buffers->rb_mws);
1329 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330 break;
1331 default:
1332 break;
1333 }
1334 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1335}
1336
1337/*
1338 * Recover reply buffers from pool.
1339 * This happens when recovering from error conditions.
1340 * Post-increment counter/array index.
1341 */
1342void
1343rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1344{
1345 struct rpcrdma_buffer *buffers = req->rl_buffer;
1346 unsigned long flags;
1347
1348 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1349 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350 spin_lock_irqsave(&buffers->rb_lock, flags);
1351 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1354 }
1355 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1356}
1357
1358/*
1359 * Put reply buffers back into pool when not attached to
1360 * request. This happens in error conditions, and when
1361 * aborting unbinds. Pre-decrement counter/array index.
1362 */
1363void
1364rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1365{
1366 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367 unsigned long flags;
1368
1369 rep->rr_func = NULL;
1370 spin_lock_irqsave(&buffers->rb_lock, flags);
1371 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1373}
1374
1375/*
1376 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1377 */
1378
1379int
1380rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381 struct ib_mr **mrp, struct ib_sge *iov)
1382{
1383 struct ib_phys_buf ipb;
1384 struct ib_mr *mr;
1385 int rc;
1386
1387 /*
1388 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1389 */
1390 iov->addr = ib_dma_map_single(ia->ri_id->device,
1391 va, len, DMA_BIDIRECTIONAL);
1392 iov->length = len;
1393
Tom Talpeybd7ed1d2008-10-09 15:00:09 -04001394 if (ia->ri_have_dma_lkey) {
1395 *mrp = NULL;
1396 iov->lkey = ia->ri_dma_lkey;
1397 return 0;
1398 } else if (ia->ri_bind_mem != NULL) {
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001399 *mrp = NULL;
1400 iov->lkey = ia->ri_bind_mem->lkey;
1401 return 0;
1402 }
1403
1404 ipb.addr = iov->addr;
1405 ipb.size = iov->length;
1406 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1408
1409 dprintk("RPC: %s: phys convert: 0x%llx "
1410 "registered 0x%llx length %d\n",
Andrew Mortona56daeb2007-10-16 01:29:57 -07001411 __func__, (unsigned long long)ipb.addr,
1412 (unsigned long long)iov->addr, len);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001413
1414 if (IS_ERR(mr)) {
1415 *mrp = NULL;
1416 rc = PTR_ERR(mr);
1417 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1418 } else {
1419 *mrp = mr;
1420 iov->lkey = mr->lkey;
1421 rc = 0;
1422 }
1423
1424 return rc;
1425}
1426
1427int
1428rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429 struct ib_mr *mr, struct ib_sge *iov)
1430{
1431 int rc;
1432
1433 ib_dma_unmap_single(ia->ri_id->device,
1434 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436 if (NULL == mr)
1437 return 0;
1438
1439 rc = ib_dereg_mr(mr);
1440 if (rc)
1441 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1442 return rc;
1443}
1444
1445/*
1446 * Wrappers for chunk registration, shared by read/write chunk code.
1447 */
1448
1449static void
1450rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1451{
1452 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453 seg->mr_dmalen = seg->mr_len;
1454 if (seg->mr_page)
1455 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456 seg->mr_page, offset_in_page(seg->mr_offset),
1457 seg->mr_dmalen, seg->mr_dir);
1458 else
1459 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460 seg->mr_offset,
1461 seg->mr_dmalen, seg->mr_dir);
Tom Tucker5c635e02011-02-09 19:45:34 +00001462 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464 __func__,
Randy Dunlap986d4ab2011-03-15 17:11:59 -07001465 (unsigned long long)seg->mr_dma,
1466 seg->mr_offset, seg->mr_dmalen);
Tom Tucker5c635e02011-02-09 19:45:34 +00001467 }
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001468}
1469
1470static void
1471rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1472{
1473 if (seg->mr_page)
1474 ib_dma_unmap_page(ia->ri_id->device,
1475 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476 else
1477 ib_dma_unmap_single(ia->ri_id->device,
1478 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479}
1480
Tom Talpey8d4ba032008-10-09 14:59:49 -04001481static int
Tom Talpey3197d3092008-10-09 15:00:20 -04001482rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483 int *nsegs, int writing, struct rpcrdma_ia *ia,
1484 struct rpcrdma_xprt *r_xprt)
1485{
1486 struct rpcrdma_mr_seg *seg1 = seg;
Tom Tucker5c635e02011-02-09 19:45:34 +00001487 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1488
Tom Talpey3197d3092008-10-09 15:00:20 -04001489 u8 key;
1490 int len, pageoff;
1491 int i, rc;
1492
1493 pageoff = offset_in_page(seg1->mr_offset);
1494 seg1->mr_offset -= pageoff; /* start of page */
1495 seg1->mr_len += pageoff;
1496 len = -pageoff;
1497 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1498 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1499 for (i = 0; i < *nsegs;) {
1500 rpcrdma_map_one(ia, seg, writing);
1501 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1502 len += seg->mr_len;
Tom Tucker5c635e02011-02-09 19:45:34 +00001503 BUG_ON(seg->mr_len > PAGE_SIZE);
Tom Talpey3197d3092008-10-09 15:00:20 -04001504 ++seg;
1505 ++i;
1506 /* Check for holes */
1507 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1508 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1509 break;
1510 }
1511 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1512 __func__, seg1->mr_chunk.rl_mw, i);
1513
Tom Tucker5c635e02011-02-09 19:45:34 +00001514 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1516 __func__,
1517 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518 /* Invalidate before using. */
1519 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521 invalidate_wr.next = &frmr_wr;
1522 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524 invalidate_wr.ex.invalidate_rkey =
1525 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526 DECR_CQCOUNT(&r_xprt->rx_ep);
1527 post_wr = &invalidate_wr;
1528 } else
1529 post_wr = &frmr_wr;
1530
Tom Talpey3197d3092008-10-09 15:00:20 -04001531 /* Bump the key */
1532 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1533 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1534
1535 /* Prepare FRMR WR */
1536 memset(&frmr_wr, 0, sizeof frmr_wr);
Tom Tucker5c635e02011-02-09 19:45:34 +00001537 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
Tom Talpey3197d3092008-10-09 15:00:20 -04001538 frmr_wr.opcode = IB_WR_FAST_REG_MR;
Tom Tucker5c635e02011-02-09 19:45:34 +00001539 frmr_wr.send_flags = IB_SEND_SIGNALED;
Steve Wise7a8b80eb2010-08-11 12:47:08 -04001540 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
Tom Talpey3197d3092008-10-09 15:00:20 -04001541 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1542 frmr_wr.wr.fast_reg.page_list_len = i;
1543 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1544 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
Tom Tucker5c635e02011-02-09 19:45:34 +00001545 BUG_ON(frmr_wr.wr.fast_reg.length < len);
Tom Talpey3197d3092008-10-09 15:00:20 -04001546 frmr_wr.wr.fast_reg.access_flags = (writing ?
Vu Pham68743082009-05-26 14:51:00 -04001547 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1548 IB_ACCESS_REMOTE_READ);
Tom Talpey3197d3092008-10-09 15:00:20 -04001549 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1550 DECR_CQCOUNT(&r_xprt->rx_ep);
1551
Tom Tucker5c635e02011-02-09 19:45:34 +00001552 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
Tom Talpey3197d3092008-10-09 15:00:20 -04001553
1554 if (rc) {
1555 dprintk("RPC: %s: failed ib_post_send for register,"
1556 " status %i\n", __func__, rc);
1557 while (i--)
1558 rpcrdma_unmap_one(ia, --seg);
1559 } else {
1560 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561 seg1->mr_base = seg1->mr_dma + pageoff;
1562 seg1->mr_nsegs = i;
1563 seg1->mr_len = len;
1564 }
1565 *nsegs = i;
1566 return rc;
1567}
1568
1569static int
1570rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1572{
1573 struct rpcrdma_mr_seg *seg1 = seg;
1574 struct ib_send_wr invalidate_wr, *bad_wr;
1575 int rc;
1576
1577 while (seg1->mr_nsegs--)
1578 rpcrdma_unmap_one(ia, seg++);
1579
1580 memset(&invalidate_wr, 0, sizeof invalidate_wr);
Tom Tucker5c635e02011-02-09 19:45:34 +00001581 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
Tom Talpey3197d3092008-10-09 15:00:20 -04001582 invalidate_wr.opcode = IB_WR_LOCAL_INV;
Tom Tucker5c635e02011-02-09 19:45:34 +00001583 invalidate_wr.send_flags = IB_SEND_SIGNALED;
Tom Talpey3197d3092008-10-09 15:00:20 -04001584 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585 DECR_CQCOUNT(&r_xprt->rx_ep);
1586
1587 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588 if (rc)
1589 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1590 " status %i\n", __func__, rc);
1591 return rc;
1592}
1593
1594static int
Tom Talpey8d4ba032008-10-09 14:59:49 -04001595rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1596 int *nsegs, int writing, struct rpcrdma_ia *ia)
1597{
1598 struct rpcrdma_mr_seg *seg1 = seg;
1599 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1600 int len, pageoff, i, rc;
1601
1602 pageoff = offset_in_page(seg1->mr_offset);
1603 seg1->mr_offset -= pageoff; /* start of page */
1604 seg1->mr_len += pageoff;
1605 len = -pageoff;
1606 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1607 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1608 for (i = 0; i < *nsegs;) {
1609 rpcrdma_map_one(ia, seg, writing);
1610 physaddrs[i] = seg->mr_dma;
1611 len += seg->mr_len;
1612 ++seg;
1613 ++i;
1614 /* Check for holes */
1615 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1616 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1617 break;
1618 }
1619 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1620 physaddrs, i, seg1->mr_dma);
1621 if (rc) {
1622 dprintk("RPC: %s: failed ib_map_phys_fmr "
1623 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1624 len, (unsigned long long)seg1->mr_dma,
1625 pageoff, i, rc);
1626 while (i--)
1627 rpcrdma_unmap_one(ia, --seg);
1628 } else {
1629 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1630 seg1->mr_base = seg1->mr_dma + pageoff;
1631 seg1->mr_nsegs = i;
1632 seg1->mr_len = len;
1633 }
1634 *nsegs = i;
1635 return rc;
1636}
1637
1638static int
1639rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640 struct rpcrdma_ia *ia)
1641{
1642 struct rpcrdma_mr_seg *seg1 = seg;
1643 LIST_HEAD(l);
1644 int rc;
1645
1646 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647 rc = ib_unmap_fmr(&l);
1648 while (seg1->mr_nsegs--)
1649 rpcrdma_unmap_one(ia, seg++);
1650 if (rc)
1651 dprintk("RPC: %s: failed ib_unmap_fmr,"
1652 " status %i\n", __func__, rc);
1653 return rc;
1654}
1655
static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}
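
/*
 * Unbind a memory window. When a reply context is supplied (*r is
 * non-NULL), the unbind is posted signaled and *r is cleared so the
 * reply upcall is deferred to the send completion handler; otherwise
 * the unbind is unsignaled and the caller handles the reply itself.
 */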
static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}
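
/*
 * Default (slow-path) registration: build a physical buffer list for
 * the DMA-mapped segments and register it with ib_reg_phys_mr() on
 * every call, stopping early at the first page-alignment hole.
 */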
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
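
/*
 * Undo a default registration: deregister the MR, then DMA-unmap each
 * segment it covered.
 */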
static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}
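
/*
 * Register a chunk of memory for remote access, dispatching on the
 * interface's memory registration strategy. Returns the number of
 * segments consumed (a strategy may take fewer than requested), or
 * -1 on error.
 *
 * Purely illustrative caller sketch (not the transport's actual
 * marshaling code), registering "total" segments a chunk at a time:
 *
 *	int n, done = 0;
 *	while (done < total) {
 *		n = rpcrdma_register_external(&seg[done], total - done,
 *					      writing, r_xprt);
 *		if (n < 0)
 *			break;
 *		done += n;
 *	}
 */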
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using frmr */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}
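
/*
 * Release a chunk registered by rpcrdma_register_external(), again
 * dispatching on the registration strategy. Returns the number of
 * segments that had been registered.
 */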
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
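	/*
	 * If the reply context was not handed off to a signaled
	 * memory-window unbind above (r is still non-NULL), run its
	 * deferred handler now that deregistration is complete.
	 */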
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);
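
	/*
	 * Post most sends unsignaled; when the endpoint's completion
	 * budget runs out, re-arm it (INIT_CQCOUNT) and request a
	 * signaled completion so the provider takes a send completion
	 * every now and then.
	 */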
	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}