\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */
39
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040040/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
Alexey Dobriyana6b7a402011-06-06 10:43:46 +000050#include <linux/interrupt.h>
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040051#include <linux/pci.h> /* for Tavor hack below */
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090052#include <linux/slab.h>
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040053
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040054#include "xprt_rdma.h"
55
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040056/*
57 * Globals/Macros
58 */
59
60#ifdef RPC_DEBUG
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
64/*
65 * internal functions
66 */
67
68/*
69 * handle replies in tasklet context, using a single, global list
70 * rdma tasklet function -- just turn around and call the func
71 * for all replies on the list
72 */
73
74static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
75static LIST_HEAD(rpcrdma_tasklets_g);
76
77static void
78rpcrdma_run_tasklet(unsigned long data)
79{
80 struct rpcrdma_rep *rep;
81 void (*func)(struct rpcrdma_rep *);
82 unsigned long flags;
83
84 data = data;
85 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
86 while (!list_empty(&rpcrdma_tasklets_g)) {
87 rep = list_entry(rpcrdma_tasklets_g.next,
88 struct rpcrdma_rep, rr_list);
89 list_del(&rep->rr_list);
90 func = rep->rr_func;
91 rep->rr_func = NULL;
92 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93
94 if (func)
95 func(rep);
96 else
97 rpcrdma_recv_buffer_put(rep);
98
99 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100 }
101 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102}
103
104static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105
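/* Queue a reply on the global tasklet list and kick the reply tasklet. */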
static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

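/*
 * Asynchronous QP/CQ error handlers: if the endpoint is currently
 * connected, mark it failed (-EIO) and wake any connection waiters.
 */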
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

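/*
 * Process a single work completion: track FRMR state on FAST_REG_MR and
 * LOCAL_INV completions, capture the receive length and flow-control
 * credits on receives, and hand completed replies to the reply tasklet.
 */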
static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr;
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
			__func__, wc->opcode, wc->status);
		rep->rr_len = ~0U;
		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
			rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_INVALID;
		break;
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after checking validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

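/* Drain all available completions from a CQ, processing each in order. */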
static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

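/*
 * Connection manager event handler: records address/route resolution
 * results, translates CM connection events into rep_connected state,
 * and wakes threads waiting on the connection.
 */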
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}

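/*
 * Create an RDMA CM id for this transport and synchronously resolve the
 * server's address and route, waiting (with a timeout) for the CM
 * callbacks to complete each step.
 */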
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *  o if event handles and PD have been initialized, free them.
 *  o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Add room for frmr register and invalidate WRs.
		 * 1. FRMR reg WR for head
		 * 2. FRMR invalidate WR for head
		 * 3. FRMR reg WR for pagelist
		 * 4. FRMR invalidate WR for pagelist
		 * 5. FRMR reg WR for tail
		 * 6. FRMR invalidate WR for tail
		 * 7. The RDMA_SEND WR
		 */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone best-effort handling.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
							ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1. arrays for send and recv pointers
	 *   2. arrays of struct rpcrdma_req to fill in pointers
	 *   3. array of struct rpcrdma_rep for replies
	 *   4. padding, if any
	 *   5. mw's, fmr's or frmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
						RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
			&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
		dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
			__func__,
			(unsigned long long)seg->mr_dma,
			seg->mr_offset, seg->mr_dmalen);
	}
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

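/*
 * Register a chunk via a Fast Registration Memory Region: build the
 * physical page list for the mapped segments, post an optional LOCAL_INV
 * (if the FRMR was left valid) followed by a FAST_REG_MR work request,
 * and return the resulting rkey/base/length in the first segment.
 */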
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

	u8 key;
	int len, pageoff;
	int i, rc;
	int seg_len;
	u64 pa;
	int page_no;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (page_no = i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		pa = seg->mr_dma;
		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
			seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->
				page_list[page_no++] = pa;
			pa += PAGE_SIZE;
		}
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
			__func__,
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
		/* Invalidate before using. */
		memset(&invalidate_wr, 0, sizeof invalidate_wr);
		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
		invalidate_wr.next = &frmr_wr;
		invalidate_wr.opcode = IB_WR_LOCAL_INV;
		invalidate_wr.send_flags = IB_SEND_SIGNALED;
		invalidate_wr.ex.invalidate_rkey =
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		DECR_CQCOUNT(&r_xprt->rx_ep);
		post_wr = &invalidate_wr;
	} else
		post_wr = &frmr_wr;

	/* Bump the key */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = IB_SEND_SIGNALED;
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = page_no;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
	BUG_ON(frmr_wr.wr.fast_reg.length < len);
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.send_flags = IB_SEND_SIGNALED;
	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	if (rc)
		dprintk("RPC: %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	return rc;
}

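/*
 * Register a chunk via a Fast Memory Region (FMR): collect the DMA
 * addresses of the mapped segments and map them with ib_map_phys_fmr().
 */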
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

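/*
 * Register a single segment by binding a memory window (ib_bind_mw)
 * over the pre-registered base DMA MR; only one segment is mapped
 * per bind.
 */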
static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}

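/*
 * Unbind a memory window by rebinding it with zero length. If *r is
 * non-NULL, the unbind is posted signaled with the caller's reply
 * context in wr_id, so the reply upcall is deferred until the unbind
 * completes.
 */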
static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}

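/*
 * Fallback registration path: DMA-map the segments and register their
 * physical addresses with ib_reg_phys_mr() on every call.
 */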
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

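/*
 * Release a physical MR created by the fallback path, then DMA-unmap
 * each of its segments.
 */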
static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}

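/*
 * Register a chunk of memory for RDMA access, dispatching on the
 * memory registration strategy chosen at transport setup. Returns the
 * number of segments actually registered, or -1 on error.
 *
 * Hedged usage sketch (illustrative only; caller-side names are not
 * taken from this file): a marshaling routine is expected to call this
 * once per chunk and record the resulting handle, roughly:
 *
 *	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	if (n < 0)
 *		return n;	// hypothetical error handling
 *	// seg->mr_rkey and seg->mr_base now describe the registered chunk
 */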
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001793int
1794rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1795 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1796{
1797 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001798 int rc = 0;
1799
1800 switch (ia->ri_memreg_strategy) {
1801
1802#if RPCRDMA_PERSISTENT_REGISTRATION
1803 case RPCRDMA_ALLPHYSICAL:
1804 rpcrdma_map_one(ia, seg, writing);
1805 seg->mr_rkey = ia->ri_bind_mem->rkey;
1806 seg->mr_base = seg->mr_dma;
1807 seg->mr_nsegs = 1;
1808 nsegs = 1;
1809 break;
1810#endif
1811
Tom Talpey3197d3092008-10-09 15:00:20 -04001812 /* Registration using frmr registration */
1813 case RPCRDMA_FRMR:
1814 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1815 break;
1816
Tom Talpey8d4ba032008-10-09 14:59:49 -04001817 /* Registration using fmr memory registration */
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001818 case RPCRDMA_MTHCAFMR:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001819 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001820 break;
1821
1822 /* Registration using memory windows */
1823 case RPCRDMA_MEMWINDOWS_ASYNC:
1824 case RPCRDMA_MEMWINDOWS:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001825 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001826 break;
1827
1828 /* Default registration each time */
1829 default:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001830 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001831 break;
1832 }
1833 if (rc)
1834 return -1;
1835
1836 return nsegs;
1837}
1838
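/*
 * Undo a registration made by rpcrdma_register_external(), again
 * dispatching on the configured strategy. Returns the number of
 * segments that the registration covered.
 */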
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001908 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1909 ib_dma_sync_single_for_device(ia->ri_id->device,
1910 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1911 DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else {	/* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

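	/*
	 * Most sends are posted unsignaled; once the CQ credit count
	 * above drops to zero, a signaled send is posted so the
	 * provider generates a completion and the counter is reset.
	 */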
	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}
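
/*
 * Hedged usage sketch (illustrative only, not code from this file):
 * a caller that wants to return a completed receive buffer to the
 * hardware would repost it along these lines:
 *
 *	rc = rpcrdma_ep_post_recv(ia, ep, rep);
 *	if (rc)
 *		dprintk("RPC: %s: repost failed %i\n", __func__, rc);
 *
 * The actual recycling policy is decided by the callers.
 */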