\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */
39
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040040/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
51
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040052#include "xprt_rdma.h"
53
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040054/*
55 * Globals/Macros
56 */
57
58#ifdef RPC_DEBUG
59# define RPCDBG_FACILITY RPCDBG_TRANS
60#endif
61
62/*
63 * internal functions
64 */
65
66/*
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
70 */
71
72static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73static LIST_HEAD(rpcrdma_tasklets_g);
74
75static void
76rpcrdma_run_tasklet(unsigned long data)
77{
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
81
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
96
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98 }
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100}
101
102static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104static inline void
105rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106{
107 unsigned long flags;
108
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
113}
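
/*
 * Reply processing flow: the CQ upcall queues each completed reply
 * on rpcrdma_tasklets_g via rpcrdma_schedule_tasklet(); the tasklet
 * above then invokes the rep's rr_func (the RPC reply handler), or
 * returns the buffer to the pool when no handler is set.
 */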

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: %s WC status %X, connection lost\n",
			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
			wc->status);
		rep->rr_len = ~0U;
		rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after check validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}

static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

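	/*
	 * Poll once more after re-arming: completions that arrived between
	 * the poll above and ib_req_notify_cq() would not trigger another
	 * event, so pick them up here rather than leaving them until the
	 * next completion interrupt.
	 */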
	rpcrdma_cq_poll(cq);
}

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
			" (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
				"unknown connection error",
			NIPQUAD(addr->sin_addr.s_addr),
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		ia->ri_async_rc = -EINVAL;
		dprintk("RPC: %s: unexpected CM event %X\n",
			__func__, event->event);
		complete(&ia->ri_done);
		break;
	}

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = 0;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = 0;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	init_completion(&ia->ri_done);

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	if (memreg > RPCRDMA_REGISTER) {
		int mem_priv = IB_ACCESS_LOCAL_WRITE;
		switch (memreg) {
#if RPCRDMA_PERSISTENT_REGISTRATION
		case RPCRDMA_ALLPHYSICAL:
			mem_priv |= IB_ACCESS_REMOTE_WRITE;
			mem_priv |= IB_ACCESS_REMOTE_READ;
			break;
#endif
		case RPCRDMA_MEMWINDOWS_ASYNC:
		case RPCRDMA_MEMWINDOWS:
			mem_priv |= IB_ACCESS_MW_BIND;
			break;
		default:
			break;
		}
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
	}

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
out1:
	return rc;
}
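
/*
 * Typical caller sequence (illustrative sketch only -- the RPC/RDMA
 * transport setup code is the real consumer of these exports):
 *
 *	rc = rpcrdma_ia_open(xprt, addr, memreg);
 *	if (rc == 0)
 *		rc = rpcrdma_ep_create(&xprt->rx_ep, &xprt->rx_ia, &cdata);
 *	if (rc == 0)
 *		rc = rpcrdma_buffer_create(&xprt->rx_buf, &xprt->rx_ep,
 *					   &xprt->rx_ia, &cdata);
 *	...
 *	rc = rpcrdma_ep_connect(&xprt->rx_ep, &xprt->rx_ia);
 */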

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
		rdma_destroy_qp(ia->ri_id);
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
		rdma_destroy_id(ia->ri_id);
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_BOUNCEBUFFERS:
		ep->rep_remote_cma.responder_resources = 0;
		break;
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_REGISTER:
		ep->rep_remote_cma.responder_resources = cdata->max_requests *
				(RPCRDMA_MAX_DATA_SEGS / 8);
		break;
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
#endif
		ep->rep_remote_cma.responder_resources = cdata->max_requests *
				(RPCRDMA_MAX_DATA_SEGS / 2);
		break;
	default:
		break;
	}
	if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
	ep->rep_remote_cma.initiator_depth = 0;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
	}

	ep->rep_func = NULL;

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	if (ia->ri_id->qp) {
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;
	int reconnect = (ep->rep_connected != 0);

	if (reconnect) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

/* XXX Tavor device performs badly with 2K MTU! */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	/* Theoretically a client initiator_depth > 0 is not needed,
	 * but many peers fail to complete the connection unless they
	 * == responder_resources! */
	if (ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)
		ep->rep_remote_cma.initiator_depth =
			ep->rep_remote_cma.responder_resources;

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	if (reconnect)
		return 0;

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED
	    && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		ep->rep_remote_cma.initiator_depth =
			ep->rep_remote_cma.responder_resources;
		if (ep->rep_remote_cma.initiator_depth == 0)
			++ep->rep_remote_cma.initiator_depth;
		if (ep->rep_remote_cma.responder_resources == 0)
			++ep->rep_remote_cma.responder_resources;
		if (retry_count++ == 0)
			goto retry;
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
						ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 *   4.  padding, if any
	 *   5.  mw's or fmr's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}
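
/*
 * Pairing sketch (illustrative only): a caller marshaling an RPC
 * typically does
 *
 *	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 *	if (req == NULL)
 *		back off and retry later;
 *	... fill in rl_send_iov, post via rpcrdma_ep_post() ...
 *
 * and hands the pair back with rpcrdma_buffer_put(req) once the RPC
 * is retired, which also recycles any MWs attached above.
 */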

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
					&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	LIST_HEAD(l);
	int rc;

	list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
	rc = ib_unmap_fmr(&l);
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_unmap_fmr,"
			" status %i\n", __func__, rc);
	return rc;
}

static int
rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct ib_mw_bind param;
	int rc;

	*nsegs = 1;
	rpcrdma_map_one(ia, seg, writing);
	param.mr = ia->ri_bind_mem;
	param.wr_id = 0ULL;	/* no send cookie */
	param.addr = seg->mr_dma;
	param.length = seg->mr_len;
	param.send_flags = 0;
	param.mw_access_flags = mem_priv;

	DECR_CQCOUNT(&r_xprt->rx_ep);
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	if (rc) {
		dprintk("RPC: %s: failed ib_bind_mw "
			"%u@0x%llx status %i\n",
			__func__, seg->mr_len,
			(unsigned long long)seg->mr_dma, rc);
		rpcrdma_unmap_one(ia, seg);
	} else {
		seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
		seg->mr_base = param.addr;
		seg->mr_nsegs = 1;
	}
	return rc;
}

static int
rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt, void **r)
{
	struct ib_mw_bind param;
	LIST_HEAD(l);
	int rc;

	BUG_ON(seg->mr_nsegs != 1);
	param.mr = ia->ri_bind_mem;
	param.addr = 0ULL;	/* unbind */
	param.length = 0;
	param.mw_access_flags = 0;
	if (*r) {
		param.wr_id = (u64) (unsigned long) *r;
		param.send_flags = IB_SEND_SIGNALED;
		INIT_CQCOUNT(&r_xprt->rx_ep);
	} else {
		param.wr_id = 0ULL;
		param.send_flags = 0;
		DECR_CQCOUNT(&r_xprt->rx_ep);
	}
	rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
	rpcrdma_unmap_one(ia, seg);
	if (rc)
		dprintk("RPC: %s: failed ib_(un)bind_mw,"
			" status %i\n", __func__, rc);
	else
		*r = NULL;	/* will upcall on completion */
	return rc;
}

static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}

static int
rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
			struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	int rc;

	rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
	seg1->mr_chunk.rl_mr = NULL;
	while (seg1->mr_nsegs--)
		rpcrdma_unmap_one(ia, seg++);
	if (rc)
		dprintk("RPC: %s: failed ib_dereg_mr,"
			" status %i\n", __func__, rc);
	return rc;
}

int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}

int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

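	/*
	 * Ask for a signaled send completion only once the CQCOUNT budget
	 * (rep_cqinit, set in rpcrdma_ep_create) is used up. Unsignaled
	 * sends avoid per-send completions, but the provider still needs
	 * an occasional signaled send to retire send queue entries.
	 */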
	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}