\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/pci.h>	/* for Tavor hack below */

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

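/*
 * Queue a reply for deferred processing on the global list and
 * kick the rdma tasklet.
 */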
static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

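/*
 * QP async error upcall: a fatal QP event means the connection is
 * unusable. Mark the endpoint disconnected, notify the transport
 * (rep_func) and wake anyone waiting on the connection state.
 */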
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

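/*
 * CQ async error upcall: treat a completion queue error the same way,
 * since completions can no longer be relied upon.
 */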
static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

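/*
 * Handle a single work completion: flag flushed/failed completions,
 * capture the server's credit advertisement from a valid receive,
 * and hand the reply to the tasklet for RPC-level processing.
 */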
static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC:       %s: %s WC status %X, connection lost\n",
			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
			wc->status);
		rep->rr_len = ~0U;
		rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after check validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC:       %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC:       %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC:       %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

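/*
 * Drain the completion queue, handing each completed work request
 * to rpcrdma_event_process() until ib_poll_cq() returns empty.
 */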
static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC:       %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

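/*
 * RDMA CM event handler: completes the address/route resolution
 * waits in rpcrdma_create_id(), and maps connection events onto
 * ep->rep_connected before waking the transport.
 */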
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC:       %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC:       %s: %s: %u.%u.%u.%u:%u"
			" (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
				"unknown connection error",
			NIPQUAD(addr->sin_addr.s_addr),
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC:       %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		ia->ri_async_rc = -EINVAL;
		dprintk("RPC:       %s: unexpected CM event %X\n",
			__func__, event->event);
		complete(&ia->ri_done);
		break;
	}

	return 0;
}

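/*
 * Create a CM ID for this transport, then synchronously resolve the
 * server's address and route, waiting on ri_done for the CM callback
 * to report the outcome of each step in ri_async_rc.
 */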
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC:       %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = 0;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = 0;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	init_completion(&ia->ri_done);

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Query the device to determine if the requested memory
	 * registration strategy is supported. If it isn't, set the
	 * strategy to a globally supported model.
	 */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC:       %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC:       %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC:       %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC:       %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* Requires both frmr reg and local dma lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC:       %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC:       %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC:       %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC:       %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
		rdma_destroy_qp(ia->ri_id);
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
		rdma_destroy_id(ia->ri_id);
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC:       %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Add room for frmr register and invalidate WRs */
		ep->rep_attr.cap.max_send_wr *= 3;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC:       %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC:       %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
	}

	ep->rep_func = NULL;

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	if (ia->ri_id->qp) {
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;
	int reconnect = (ep->rep_connected != 0);

	if (reconnect) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC:       %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC:       %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC:       %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC:       %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	if (reconnect)
		return 0;

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED
	    && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC:       %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
							ep->rep_connected != 1);
		dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 *   4.  padding, if any
	 *   5.  mw's, fmr's or frmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
							 RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC:       %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC:       %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC:       %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC:       %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC:       %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC:       %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC:       %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC:       %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC:       %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC:       %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC:       %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
					&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC:       %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC:       %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

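/*
 * Register a chunk using a fast register work request (FRMR): DMA-map
 * the segments (stopping at any page hole), load them into the MR's
 * page list, bump the rkey, and post an unsignaled IB_WR_FAST_REG_MR.
 */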
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr frmr_wr, *bad_wr;
	u8 key;
	int len, pageoff;
	int i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	/* Bump the key */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = 0;			/* unsignaled */
	frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = i;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);

	if (rc) {
		dprintk("RPC:       %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

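/*
 * Invalidate an FRMR registration: unmap the segments and post an
 * unsignaled IB_WR_LOCAL_INV for the MR's current rkey.
 */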
static int
rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);

	memset(&invalidate_wr, 0, sizeof invalidate_wr);
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.send_flags = 0;			/* unsignaled */
	invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	if (rc)
		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
			" status %i\n", __func__, rc);
	return rc;
}

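/*
 * Register a chunk using an FMR: DMA-map the segments (stopping at
 * any page hole) and map their physical addresses with
 * ib_map_phys_fmr().
 */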
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC:       %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

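/*
 * Unmap an FMR registration and release the DMA mappings of its
 * segments.
 */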
static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC:       %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

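/*
 * Register a chunk by binding a memory window over the preregistered
 * base MR; only a single segment is bound per window.
 */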
static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC:       %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}

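/*
 * Unbind a memory window (zero-length bind). If a reply context is
 * supplied, the unbind is signaled and the caller is notified via
 * the completion upcall; otherwise it is unsignaled.
 */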
static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC:       %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}

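/*
 * Default registration: build a physical buffer list for the segments
 * (stopping at any page hole) and register it with ib_reg_phys_mr().
 */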
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC:       %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

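/*
 * Deregister a default (ib_reg_phys_mr) registration and unmap its
 * segments.
 */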
static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC:       %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}

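/*
 * Register an external chunk for RDMA, dispatching on the memory
 * registration strategy selected at IA open time. Returns the number
 * of segments covered, or -1 on error.
 */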
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001717int
1718rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1719 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1720{
1721 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001722 int rc = 0;
1723
1724 switch (ia->ri_memreg_strategy) {
1725
1726#if RPCRDMA_PERSISTENT_REGISTRATION
1727 case RPCRDMA_ALLPHYSICAL:
1728 rpcrdma_map_one(ia, seg, writing);
1729 seg->mr_rkey = ia->ri_bind_mem->rkey;
1730 seg->mr_base = seg->mr_dma;
1731 seg->mr_nsegs = 1;
1732 nsegs = 1;
1733 break;
1734#endif
1735
Tom Talpey3197d3092008-10-09 15:00:20 -04001736 /* Registration using frmr registration */
1737 case RPCRDMA_FRMR:
1738 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1739 break;
1740
Tom Talpey8d4ba032008-10-09 14:59:49 -04001741 /* Registration using fmr memory registration */
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001742 case RPCRDMA_MTHCAFMR:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001743 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001744 break;
1745
1746 /* Registration using memory windows */
1747 case RPCRDMA_MEMWINDOWS_ASYNC:
1748 case RPCRDMA_MEMWINDOWS:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001749 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001750 break;
1751
1752 /* Default registration each time */
1753 default:
Tom Talpey8d4ba032008-10-09 14:59:49 -04001754 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001755 break;
1756 }
1757 if (rc)
1758 return -1;
1759
1760 return nsegs;
1761}
1762
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;