blob: 44b0fb942e8db5834ddaebf81fe4fc1e3fbd5530 [file] [log] [blame]
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -04001/*
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04002 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040038 */
39
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040040/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50#include <linux/pci.h> /* for Tavor hack below */
51
\"Talpey, Thomas\f58851e2007-09-10 13:50:12 -040052#include "xprt_rdma.h"
53
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -040054/*
55 * Globals/Macros
56 */
57
58#ifdef RPC_DEBUG
59# define RPCDBG_FACILITY RPCDBG_TRANS
60#endif
61
62/*
63 * internal functions
64 */
65
66/*
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
70 */
71
72static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73static LIST_HEAD(rpcrdma_tasklets_g);
74
75static void
76rpcrdma_run_tasklet(unsigned long data)
77{
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
81
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
96
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98 }
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100}
101
102static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
104static inline void
105rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106{
107 unsigned long flags;
108
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
113}
114
115static void
116rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117{
118 struct rpcrdma_ep *ep = context;
119
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
126 }
127}
128
129static void
130rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131{
132 struct rpcrdma_ep *ep = context;
133
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
140 }
141}
142
143static inline
144void rpcrdma_event_process(struct ib_wc *wc)
145{
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
154
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
162 }
163
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170 /* Keep (only) the most recent credits, after check validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
186 }
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
188 }
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
197 }
198}
199
200static inline int
201rpcrdma_cq_poll(struct ib_cq *cq)
202{
203 struct ib_wc wc;
204 int rc;
205
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
212 }
213 if (rc == 0)
214 break;
215
216 rpcrdma_event_process(&wc);
217 }
218
219 return 0;
220}
221
222/*
223 * rpcrdma_cq_event_upcall
224 *
225 * This upcall handles recv, send, bind and unbind events.
226 * It is reentrant but processes single events in order to maintain
227 * ordering of receives to keep server credits.
228 *
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
234 *
235 * Note that send events are suppressed and do not result in an upcall.
236 */
237static void
238rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239{
240 int rc;
241
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
245
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
251 }
252
253 rpcrdma_cq_poll(cq);
254}
255
#ifdef RPC_DEBUG
/*
 * Human-readable names for connection manager events, indexed by the
 * event code.  Used for debug output only.
 */
static const char * const conn[] = {
	[RDMA_CM_EVENT_ADDR_RESOLVED]		= "address resolved",
	[RDMA_CM_EVENT_ADDR_ERROR]		= "address error",
	[RDMA_CM_EVENT_ROUTE_RESOLVED]		= "route resolved",
	[RDMA_CM_EVENT_ROUTE_ERROR]		= "route error",
	[RDMA_CM_EVENT_CONNECT_REQUEST]		= "connect request",
	[RDMA_CM_EVENT_CONNECT_RESPONSE]	= "connect response",
	[RDMA_CM_EVENT_CONNECT_ERROR]		= "connect error",
	[RDMA_CM_EVENT_UNREACHABLE]		= "unreachable",
	[RDMA_CM_EVENT_REJECTED]		= "rejected",
	[RDMA_CM_EVENT_ESTABLISHED]		= "established",
	[RDMA_CM_EVENT_DISCONNECTED]		= "disconnected",
	[RDMA_CM_EVENT_DEVICE_REMOVAL]		= "device removal"
};
#endif
272
/*
 * Connection manager event handler, registered through
 * rdma_create_id().
 *
 * @id:    CM id; its context is the owning struct rpcrdma_xprt.
 * @event: the CM event being delivered.
 *
 * Address/route resolution events (success or failure) complete
 * ia->ri_done, with ia->ri_async_rc carrying the error, if any.
 * Connection state events store the new state in ep->rep_connected
 * (1 when established, a negative errno otherwise), reset the credit
 * count to 1, call ep->rep_func and wake waiters on rep_connect_wait.
 * Any other event is reported as -EINVAL through ri_async_rc.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/*
		 * The query only feeds the debug message below.  Do not
		 * print attr when the query fails: it would be
		 * uninitialized stack contents.
		 */
		if (ib_query_qp(ia->ri_id->qp, &attr,
				IB_QP_MAX_QP_RD_ATOMIC |
				IB_QP_MAX_DEST_RD_ATOMIC,
				&iattr))
			dprintk("RPC: %s: ib_query_qp failed\n", __func__);
		else
			dprintk("RPC: %s: %d responder resources"
				" (%d initiator)\n",
				__func__, attr.max_dest_rd_atomic,
				attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		/*
		 * Bound the lookup by the table's real size rather than
		 * a hard-coded index, so the two cannot drift apart.
		 */
		dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
			" (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event < ARRAY_SIZE(conn)) ?
						conn[event->event] :
						"unknown connection error",
			NIPQUAD(addr->sin_addr.s_addr),
			ntohs(addr->sin_port),
			ep, event->event);
		/* Every connection state change restarts at one credit. */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		ia->ri_async_rc = -EINVAL;
		dprintk("RPC: %s: unexpected CM event %X\n",
			__func__, event->event);
		complete(&ia->ri_done);
		break;
	}

	return 0;
}
350
351static struct rdma_cm_id *
352rpcrdma_create_id(struct rpcrdma_xprt *xprt,
353 struct rpcrdma_ia *ia, struct sockaddr *addr)
354{
355 struct rdma_cm_id *id;
356 int rc;
357
358 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359 if (IS_ERR(id)) {
360 rc = PTR_ERR(id);
361 dprintk("RPC: %s: rdma_create_id() failed %i\n",
362 __func__, rc);
363 return id;
364 }
365
366 ia->ri_async_rc = 0;
367 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368 if (rc) {
369 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
370 __func__, rc);
371 goto out;
372 }
373 wait_for_completion(&ia->ri_done);
374 rc = ia->ri_async_rc;
375 if (rc)
376 goto out;
377
378 ia->ri_async_rc = 0;
379 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380 if (rc) {
381 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
382 __func__, rc);
383 goto out;
384 }
385 wait_for_completion(&ia->ri_done);
386 rc = ia->ri_async_rc;
387 if (rc)
388 goto out;
389
390 return id;
391
392out:
393 rdma_destroy_id(id);
394 return ERR_PTR(rc);
395}
396
397/*
398 * Drain any cq, prior to teardown.
399 */
400static void
401rpcrdma_clean_cq(struct ib_cq *cq)
402{
403 struct ib_wc wc;
404 int count = 0;
405
406 while (1 == ib_poll_cq(cq, 1, &wc))
407 ++count;
408
409 if (count)
410 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
411 __func__, count, wc.opcode);
412}
413
414/*
415 * Exported functions.
416 */
417
418/*
419 * Open and initialize an Interface Adapter.
420 * o initializes fields of struct rpcrdma_ia, including
421 * interface and provider attributes and protection zone.
422 */
423int
424rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
425{
426 int rc;
427 struct rpcrdma_ia *ia = &xprt->rx_ia;
428
429 init_completion(&ia->ri_done);
430
431 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
432 if (IS_ERR(ia->ri_id)) {
433 rc = PTR_ERR(ia->ri_id);
434 goto out1;
435 }
436
437 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
438 if (IS_ERR(ia->ri_pd)) {
439 rc = PTR_ERR(ia->ri_pd);
440 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
441 __func__, rc);
442 goto out2;
443 }
444
445 /*
446 * Optionally obtain an underlying physical identity mapping in
447 * order to do a memory window-based bind. This base registration
448 * is protected from remote access - that is enabled only by binding
449 * for the specific bytes targeted during each RPC operation, and
450 * revoked after the corresponding completion similar to a storage
451 * adapter.
452 */
453 if (memreg > RPCRDMA_REGISTER) {
454 int mem_priv = IB_ACCESS_LOCAL_WRITE;
455 switch (memreg) {
456#if RPCRDMA_PERSISTENT_REGISTRATION
457 case RPCRDMA_ALLPHYSICAL:
458 mem_priv |= IB_ACCESS_REMOTE_WRITE;
459 mem_priv |= IB_ACCESS_REMOTE_READ;
460 break;
461#endif
462 case RPCRDMA_MEMWINDOWS_ASYNC:
463 case RPCRDMA_MEMWINDOWS:
464 mem_priv |= IB_ACCESS_MW_BIND;
465 break;
466 default:
467 break;
468 }
469 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
470 if (IS_ERR(ia->ri_bind_mem)) {
471 printk(KERN_ALERT "%s: ib_get_dma_mr for "
472 "phys register failed with %lX\n\t"
473 "Will continue with degraded performance\n",
474 __func__, PTR_ERR(ia->ri_bind_mem));
475 memreg = RPCRDMA_REGISTER;
476 ia->ri_bind_mem = NULL;
477 }
478 }
479
480 /* Else will do memory reg/dereg for each chunk */
481 ia->ri_memreg_strategy = memreg;
482
483 return 0;
484out2:
485 rdma_destroy_id(ia->ri_id);
486out1:
487 return rc;
488}
489
490/*
491 * Clean up/close an IA.
492 * o if event handles and PD have been initialized, free them.
493 * o close the IA
494 */
495void
496rpcrdma_ia_close(struct rpcrdma_ia *ia)
497{
498 int rc;
499
500 dprintk("RPC: %s: entering\n", __func__);
501 if (ia->ri_bind_mem != NULL) {
502 rc = ib_dereg_mr(ia->ri_bind_mem);
503 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
504 __func__, rc);
505 }
506 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
507 rdma_destroy_qp(ia->ri_id);
508 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
509 rc = ib_dealloc_pd(ia->ri_pd);
510 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
511 __func__, rc);
512 }
513 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
514 rdma_destroy_id(ia->ri_id);
515}
516
517/*
518 * Create unconnected endpoint.
519 */
520int
521rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
522 struct rpcrdma_create_data_internal *cdata)
523{
524 struct ib_device_attr devattr;
525 int rc;
526
527 rc = ib_query_device(ia->ri_id->device, &devattr);
528 if (rc) {
529 dprintk("RPC: %s: ib_query_device failed %d\n",
530 __func__, rc);
531 return rc;
532 }
533
534 /* check provider's send/recv wr limits */
535 if (cdata->max_requests > devattr.max_qp_wr)
536 cdata->max_requests = devattr.max_qp_wr;
537
538 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
539 ep->rep_attr.qp_context = ep;
540 /* send_cq and recv_cq initialized below */
541 ep->rep_attr.srq = NULL;
542 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
543 switch (ia->ri_memreg_strategy) {
544 case RPCRDMA_MEMWINDOWS_ASYNC:
545 case RPCRDMA_MEMWINDOWS:
546 /* Add room for mw_binds+unbinds - overkill! */
547 ep->rep_attr.cap.max_send_wr++;
548 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
549 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
550 return -EINVAL;
551 break;
552 default:
553 break;
554 }
555 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
556 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
557 ep->rep_attr.cap.max_recv_sge = 1;
558 ep->rep_attr.cap.max_inline_data = 0;
559 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
560 ep->rep_attr.qp_type = IB_QPT_RC;
561 ep->rep_attr.port_num = ~0;
562
563 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
564 "iovs: send %d recv %d\n",
565 __func__,
566 ep->rep_attr.cap.max_send_wr,
567 ep->rep_attr.cap.max_recv_wr,
568 ep->rep_attr.cap.max_send_sge,
569 ep->rep_attr.cap.max_recv_sge);
570
571 /* set trigger for requesting send completion */
572 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
573 switch (ia->ri_memreg_strategy) {
574 case RPCRDMA_MEMWINDOWS_ASYNC:
575 case RPCRDMA_MEMWINDOWS:
576 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
577 break;
578 default:
579 break;
580 }
581 if (ep->rep_cqinit <= 2)
582 ep->rep_cqinit = 0;
583 INIT_CQCOUNT(ep);
584 ep->rep_ia = ia;
585 init_waitqueue_head(&ep->rep_connect_wait);
586
587 /*
588 * Create a single cq for receive dto and mw_bind (only ever
589 * care about unbind, really). Send completions are suppressed.
590 * Use single threaded tasklet upcalls to maintain ordering.
591 */
592 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
593 rpcrdma_cq_async_error_upcall, NULL,
594 ep->rep_attr.cap.max_recv_wr +
595 ep->rep_attr.cap.max_send_wr + 1, 0);
596 if (IS_ERR(ep->rep_cq)) {
597 rc = PTR_ERR(ep->rep_cq);
598 dprintk("RPC: %s: ib_create_cq failed: %i\n",
599 __func__, rc);
600 goto out1;
601 }
602
603 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
604 if (rc) {
605 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
606 __func__, rc);
607 goto out2;
608 }
609
610 ep->rep_attr.send_cq = ep->rep_cq;
611 ep->rep_attr.recv_cq = ep->rep_cq;
612
613 /* Initialize cma parameters */
614
615 /* RPC/RDMA does not use private data */
616 ep->rep_remote_cma.private_data = NULL;
617 ep->rep_remote_cma.private_data_len = 0;
618
619 /* Client offers RDMA Read but does not initiate */
620 switch (ia->ri_memreg_strategy) {
621 case RPCRDMA_BOUNCEBUFFERS:
622 ep->rep_remote_cma.responder_resources = 0;
623 break;
624 case RPCRDMA_MTHCAFMR:
625 case RPCRDMA_REGISTER:
626 ep->rep_remote_cma.responder_resources = cdata->max_requests *
627 (RPCRDMA_MAX_DATA_SEGS / 8);
628 break;
629 case RPCRDMA_MEMWINDOWS:
630 case RPCRDMA_MEMWINDOWS_ASYNC:
631#if RPCRDMA_PERSISTENT_REGISTRATION
632 case RPCRDMA_ALLPHYSICAL:
633#endif
634 ep->rep_remote_cma.responder_resources = cdata->max_requests *
635 (RPCRDMA_MAX_DATA_SEGS / 2);
636 break;
637 default:
638 break;
639 }
640 if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
641 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
642 ep->rep_remote_cma.initiator_depth = 0;
643
644 ep->rep_remote_cma.retry_count = 7;
645 ep->rep_remote_cma.flow_control = 0;
646 ep->rep_remote_cma.rnr_retry_count = 0;
647
648 return 0;
649
650out2:
651 if (ib_destroy_cq(ep->rep_cq))
652 ;
653out1:
654 return rc;
655}
656
657/*
658 * rpcrdma_ep_destroy
659 *
660 * Disconnect and destroy endpoint. After this, the only
661 * valid operations on the ep are to free it (if dynamically
662 * allocated) or re-create it.
663 *
664 * The caller's error handling must be sure to not leak the endpoint
665 * if this function fails.
666 */
667int
668rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
669{
670 int rc;
671
672 dprintk("RPC: %s: entering, connected is %d\n",
673 __func__, ep->rep_connected);
674
675 if (ia->ri_id->qp) {
676 rc = rpcrdma_ep_disconnect(ep, ia);
677 if (rc)
678 dprintk("RPC: %s: rpcrdma_ep_disconnect"
679 " returned %i\n", __func__, rc);
680 }
681
682 ep->rep_func = NULL;
683
684 /* padding - could be done in rpcrdma_buffer_destroy... */
685 if (ep->rep_pad_mr) {
686 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
687 ep->rep_pad_mr = NULL;
688 }
689
690 if (ia->ri_id->qp) {
691 rdma_destroy_qp(ia->ri_id);
692 ia->ri_id->qp = NULL;
693 }
694
695 rpcrdma_clean_cq(ep->rep_cq);
696 rc = ib_destroy_cq(ep->rep_cq);
697 if (rc)
698 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
699 __func__, rc);
700
701 return rc;
702}
703
704/*
705 * Connect unconnected endpoint.
706 */
707int
708rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
709{
710 struct rdma_cm_id *id;
711 int rc = 0;
712 int retry_count = 0;
713 int reconnect = (ep->rep_connected != 0);
714
715 if (reconnect) {
716 struct rpcrdma_xprt *xprt;
717retry:
718 rc = rpcrdma_ep_disconnect(ep, ia);
719 if (rc && rc != -ENOTCONN)
720 dprintk("RPC: %s: rpcrdma_ep_disconnect"
721 " status %i\n", __func__, rc);
722 rpcrdma_clean_cq(ep->rep_cq);
723
724 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
725 id = rpcrdma_create_id(xprt, ia,
726 (struct sockaddr *)&xprt->rx_data.addr);
727 if (IS_ERR(id)) {
728 rc = PTR_ERR(id);
729 goto out;
730 }
731 /* TEMP TEMP TEMP - fail if new device:
732 * Deregister/remarshal *all* requests!
733 * Close and recreate adapter, pd, etc!
734 * Re-determine all attributes still sane!
735 * More stuff I haven't thought of!
736 * Rrrgh!
737 */
738 if (ia->ri_id->device != id->device) {
739 printk("RPC: %s: can't reconnect on "
740 "different device!\n", __func__);
741 rdma_destroy_id(id);
742 rc = -ENETDOWN;
743 goto out;
744 }
745 /* END TEMP */
746 rdma_destroy_id(ia->ri_id);
747 ia->ri_id = id;
748 }
749
750 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
751 if (rc) {
752 dprintk("RPC: %s: rdma_create_qp failed %i\n",
753 __func__, rc);
754 goto out;
755 }
756
757/* XXX Tavor device performs badly with 2K MTU! */
758if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
759 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
760 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
761 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
762 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
763 struct ib_qp_attr attr = {
764 .path_mtu = IB_MTU_1024
765 };
766 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
767 }
768}
769
770 /* Theoretically a client initiator_depth > 0 is not needed,
771 * but many peers fail to complete the connection unless they
772 * == responder_resources! */
773 if (ep->rep_remote_cma.initiator_depth !=
774 ep->rep_remote_cma.responder_resources)
775 ep->rep_remote_cma.initiator_depth =
776 ep->rep_remote_cma.responder_resources;
777
778 ep->rep_connected = 0;
779
780 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
781 if (rc) {
782 dprintk("RPC: %s: rdma_connect() failed with %i\n",
783 __func__, rc);
784 goto out;
785 }
786
787 if (reconnect)
788 return 0;
789
790 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
791
792 /*
793 * Check state. A non-peer reject indicates no listener
794 * (ECONNREFUSED), which may be a transient state. All
795 * others indicate a transport condition which has already
796 * undergone a best-effort.
797 */
798 if (ep->rep_connected == -ECONNREFUSED
799 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
800 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
801 goto retry;
802 }
803 if (ep->rep_connected <= 0) {
804 /* Sometimes, the only way to reliably connect to remote
805 * CMs is to use same nonzero values for ORD and IRD. */
806 ep->rep_remote_cma.initiator_depth =
807 ep->rep_remote_cma.responder_resources;
808 if (ep->rep_remote_cma.initiator_depth == 0)
809 ++ep->rep_remote_cma.initiator_depth;
810 if (ep->rep_remote_cma.responder_resources == 0)
811 ++ep->rep_remote_cma.responder_resources;
812 if (retry_count++ == 0)
813 goto retry;
814 rc = ep->rep_connected;
815 } else {
816 dprintk("RPC: %s: connected\n", __func__);
817 }
818
819out:
820 if (rc)
821 ep->rep_connected = rc;
822 return rc;
823}
824
825/*
826 * rpcrdma_ep_disconnect
827 *
828 * This is separate from destroy to facilitate the ability
829 * to reconnect without recreating the endpoint.
830 *
831 * This call is not reentrant, and must not be made in parallel
832 * on the same endpoint.
833 */
834int
835rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
836{
837 int rc;
838
839 rpcrdma_clean_cq(ep->rep_cq);
840 rc = rdma_disconnect(ia->ri_id);
841 if (!rc) {
842 /* returns without wait if not connected */
843 wait_event_interruptible(ep->rep_connect_wait,
844 ep->rep_connected != 1);
845 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
846 (ep->rep_connected == 1) ? "still " : "dis");
847 } else {
848 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
849 ep->rep_connected = rc;
850 }
851 return rc;
852}
853
854/*
855 * Initialize buffer memory
856 */
857int
858rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
859 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
860{
861 char *p;
862 size_t len;
863 int i, rc;
864
865 buf->rb_max_requests = cdata->max_requests;
866 spin_lock_init(&buf->rb_lock);
867 atomic_set(&buf->rb_credits, 1);
868
869 /* Need to allocate:
870 * 1. arrays for send and recv pointers
871 * 2. arrays of struct rpcrdma_req to fill in pointers
872 * 3. array of struct rpcrdma_rep for replies
873 * 4. padding, if any
874 * 5. mw's, if any
875 * Send/recv buffers in req/rep need to be registered
876 */
877
878 len = buf->rb_max_requests *
879 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
880 len += cdata->padding;
881 switch (ia->ri_memreg_strategy) {
882 case RPCRDMA_MTHCAFMR:
883 /* TBD we are perhaps overallocating here */
884 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
885 sizeof(struct rpcrdma_mw);
886 break;
887 case RPCRDMA_MEMWINDOWS_ASYNC:
888 case RPCRDMA_MEMWINDOWS:
889 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
890 sizeof(struct rpcrdma_mw);
891 break;
892 default:
893 break;
894 }
895
896 /* allocate 1, 4 and 5 in one shot */
897 p = kzalloc(len, GFP_KERNEL);
898 if (p == NULL) {
899 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
900 __func__, len);
901 rc = -ENOMEM;
902 goto out;
903 }
904 buf->rb_pool = p; /* for freeing it later */
905
906 buf->rb_send_bufs = (struct rpcrdma_req **) p;
907 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
908 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
909 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
910
911 /*
912 * Register the zeroed pad buffer, if any.
913 */
914 if (cdata->padding) {
915 rc = rpcrdma_register_internal(ia, p, cdata->padding,
916 &ep->rep_pad_mr, &ep->rep_pad);
917 if (rc)
918 goto out;
919 }
920 p += cdata->padding;
921
922 /*
923 * Allocate the fmr's, or mw's for mw_bind chunk registration.
924 * We "cycle" the mw's in order to minimize rkey reuse,
925 * and also reduce unbind-to-bind collision.
926 */
927 INIT_LIST_HEAD(&buf->rb_mws);
928 switch (ia->ri_memreg_strategy) {
929 case RPCRDMA_MTHCAFMR:
930 {
931 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
932 struct ib_fmr_attr fa = {
933 RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
934 };
935 /* TBD we are perhaps overallocating here */
936 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
937 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
938 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
939 &fa);
940 if (IS_ERR(r->r.fmr)) {
941 rc = PTR_ERR(r->r.fmr);
942 dprintk("RPC: %s: ib_alloc_fmr"
943 " failed %i\n", __func__, rc);
944 goto out;
945 }
946 list_add(&r->mw_list, &buf->rb_mws);
947 ++r;
948 }
949 }
950 break;
951 case RPCRDMA_MEMWINDOWS_ASYNC:
952 case RPCRDMA_MEMWINDOWS:
953 {
954 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
955 /* Allocate one extra request's worth, for full cycling */
956 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
957 r->r.mw = ib_alloc_mw(ia->ri_pd);
958 if (IS_ERR(r->r.mw)) {
959 rc = PTR_ERR(r->r.mw);
960 dprintk("RPC: %s: ib_alloc_mw"
961 " failed %i\n", __func__, rc);
962 goto out;
963 }
964 list_add(&r->mw_list, &buf->rb_mws);
965 ++r;
966 }
967 }
968 break;
969 default:
970 break;
971 }
972
973 /*
974 * Allocate/init the request/reply buffers. Doing this
975 * using kmalloc for now -- one for each buf.
976 */
977 for (i = 0; i < buf->rb_max_requests; i++) {
978 struct rpcrdma_req *req;
979 struct rpcrdma_rep *rep;
980
981 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
982 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
983 /* Typical ~2400b, so rounding up saves work later */
984 if (len < 4096)
985 len = 4096;
986 req = kmalloc(len, GFP_KERNEL);
987 if (req == NULL) {
988 dprintk("RPC: %s: request buffer %d alloc"
989 " failed\n", __func__, i);
990 rc = -ENOMEM;
991 goto out;
992 }
993 memset(req, 0, sizeof(struct rpcrdma_req));
994 buf->rb_send_bufs[i] = req;
995 buf->rb_send_bufs[i]->rl_buffer = buf;
996
997 rc = rpcrdma_register_internal(ia, req->rl_base,
998 len - offsetof(struct rpcrdma_req, rl_base),
999 &buf->rb_send_bufs[i]->rl_handle,
1000 &buf->rb_send_bufs[i]->rl_iov);
1001 if (rc)
1002 goto out;
1003
1004 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1005
1006 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1007 rep = kmalloc(len, GFP_KERNEL);
1008 if (rep == NULL) {
1009 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1010 __func__, i);
1011 rc = -ENOMEM;
1012 goto out;
1013 }
1014 memset(rep, 0, sizeof(struct rpcrdma_rep));
1015 buf->rb_recv_bufs[i] = rep;
1016 buf->rb_recv_bufs[i]->rr_buffer = buf;
1017 init_waitqueue_head(&rep->rr_unbind);
1018
1019 rc = rpcrdma_register_internal(ia, rep->rr_base,
1020 len - offsetof(struct rpcrdma_rep, rr_base),
1021 &buf->rb_recv_bufs[i]->rr_handle,
1022 &buf->rb_recv_bufs[i]->rr_iov);
1023 if (rc)
1024 goto out;
1025
1026 }
1027 dprintk("RPC: %s: max_requests %d\n",
1028 __func__, buf->rb_max_requests);
1029 /* done */
1030 return 0;
1031out:
1032 rpcrdma_buffer_destroy(buf);
1033 return rc;
1034}
1035
1036/*
1037 * Unregister and destroy buffer memory. Need to deal with
1038 * partial initialization, so it's callable from failed create.
1039 * Must be called before destroying endpoint, as registrations
1040 * reference it.
1041 */
1042void
1043rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1044{
1045 int rc, i;
1046 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1047
1048 /* clean up in reverse order from create
1049 * 1. recv mr memory (mr free, then kfree)
1050 * 1a. bind mw memory
1051 * 2. send mr memory (mr free, then kfree)
1052 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1053 * 4. arrays
1054 */
1055 dprintk("RPC: %s: entering\n", __func__);
1056
1057 for (i = 0; i < buf->rb_max_requests; i++) {
1058 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1059 rpcrdma_deregister_internal(ia,
1060 buf->rb_recv_bufs[i]->rr_handle,
1061 &buf->rb_recv_bufs[i]->rr_iov);
1062 kfree(buf->rb_recv_bufs[i]);
1063 }
1064 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1065 while (!list_empty(&buf->rb_mws)) {
1066 struct rpcrdma_mw *r;
1067 r = list_entry(buf->rb_mws.next,
1068 struct rpcrdma_mw, mw_list);
1069 list_del(&r->mw_list);
1070 switch (ia->ri_memreg_strategy) {
1071 case RPCRDMA_MTHCAFMR:
1072 rc = ib_dealloc_fmr(r->r.fmr);
1073 if (rc)
1074 dprintk("RPC: %s:"
1075 " ib_dealloc_fmr"
1076 " failed %i\n",
1077 __func__, rc);
1078 break;
1079 case RPCRDMA_MEMWINDOWS_ASYNC:
1080 case RPCRDMA_MEMWINDOWS:
1081 rc = ib_dealloc_mw(r->r.mw);
1082 if (rc)
1083 dprintk("RPC: %s:"
1084 " ib_dealloc_mw"
1085 " failed %i\n",
1086 __func__, rc);
1087 break;
1088 default:
1089 break;
1090 }
1091 }
1092 rpcrdma_deregister_internal(ia,
1093 buf->rb_send_bufs[i]->rl_handle,
1094 &buf->rb_send_bufs[i]->rl_iov);
1095 kfree(buf->rb_send_bufs[i]);
1096 }
1097 }
1098
1099 kfree(buf->rb_pool);
1100}
1101
1102/*
1103 * Get a set of request/reply buffers.
1104 *
1105 * Reply buffer (if needed) is attached to send buffer upon return.
1106 * Rule:
1107 * rb_send_index and rb_recv_index MUST always be pointing to the
1108 * *next* available buffer (non-NULL). They are incremented after
1109 * removing buffers, and decremented *before* returning them.
1110 */
1111struct rpcrdma_req *
1112rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1113{
1114 struct rpcrdma_req *req;
1115 unsigned long flags;
1116
1117 spin_lock_irqsave(&buffers->rb_lock, flags);
1118 if (buffers->rb_send_index == buffers->rb_max_requests) {
1119 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1120 dprintk("RPC: %s: out of request buffers\n", __func__);
1121 return ((struct rpcrdma_req *)NULL);
1122 }
1123
1124 req = buffers->rb_send_bufs[buffers->rb_send_index];
1125 if (buffers->rb_send_index < buffers->rb_recv_index) {
1126 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1127 __func__,
1128 buffers->rb_recv_index - buffers->rb_send_index);
1129 req->rl_reply = NULL;
1130 } else {
1131 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1132 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1133 }
1134 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1135 if (!list_empty(&buffers->rb_mws)) {
1136 int i = RPCRDMA_MAX_SEGS - 1;
1137 do {
1138 struct rpcrdma_mw *r;
1139 r = list_entry(buffers->rb_mws.next,
1140 struct rpcrdma_mw, mw_list);
1141 list_del(&r->mw_list);
1142 req->rl_segments[i].mr_chunk.rl_mw = r;
1143 } while (--i >= 0);
1144 }
1145 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1146 return req;
1147}
1148
1149/*
1150 * Put request/reply buffers back into pool.
1151 * Pre-decrement counter/array index.
1152 */
1153void
1154rpcrdma_buffer_put(struct rpcrdma_req *req)
1155{
1156 struct rpcrdma_buffer *buffers = req->rl_buffer;
1157 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1158 int i;
1159 unsigned long flags;
1160
1161 BUG_ON(req->rl_nchunks != 0);
1162 spin_lock_irqsave(&buffers->rb_lock, flags);
1163 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1164 req->rl_niovs = 0;
1165 if (req->rl_reply) {
1166 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1167 init_waitqueue_head(&req->rl_reply->rr_unbind);
1168 req->rl_reply->rr_func = NULL;
1169 req->rl_reply = NULL;
1170 }
1171 switch (ia->ri_memreg_strategy) {
1172 case RPCRDMA_MTHCAFMR:
1173 case RPCRDMA_MEMWINDOWS_ASYNC:
1174 case RPCRDMA_MEMWINDOWS:
1175 /*
1176 * Cycle mw's back in reverse order, and "spin" them.
1177 * This delays and scrambles reuse as much as possible.
1178 */
1179 i = 1;
1180 do {
1181 struct rpcrdma_mw **mw;
1182 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1183 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1184 *mw = NULL;
1185 } while (++i < RPCRDMA_MAX_SEGS);
1186 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1187 &buffers->rb_mws);
1188 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1189 break;
1190 default:
1191 break;
1192 }
1193 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1194}
1195
1196/*
1197 * Recover reply buffers from pool.
1198 * This happens when recovering from error conditions.
1199 * Post-increment counter/array index.
1200 */
1201void
1202rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1203{
1204 struct rpcrdma_buffer *buffers = req->rl_buffer;
1205 unsigned long flags;
1206
1207 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1208 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1209 spin_lock_irqsave(&buffers->rb_lock, flags);
1210 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1211 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1212 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1213 }
1214 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1215}
1216
1217/*
1218 * Put reply buffers back into pool when not attached to
1219 * request. This happens in error conditions, and when
1220 * aborting unbinds. Pre-decrement counter/array index.
1221 */
1222void
1223rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1224{
1225 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1226 unsigned long flags;
1227
1228 rep->rr_func = NULL;
1229 spin_lock_irqsave(&buffers->rb_lock, flags);
1230 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1231 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1232}
1233
1234/*
1235 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1236 */
1237
1238int
1239rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1240 struct ib_mr **mrp, struct ib_sge *iov)
1241{
1242 struct ib_phys_buf ipb;
1243 struct ib_mr *mr;
1244 int rc;
1245
1246 /*
1247 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1248 */
1249 iov->addr = ib_dma_map_single(ia->ri_id->device,
1250 va, len, DMA_BIDIRECTIONAL);
1251 iov->length = len;
1252
1253 if (ia->ri_bind_mem != NULL) {
1254 *mrp = NULL;
1255 iov->lkey = ia->ri_bind_mem->lkey;
1256 return 0;
1257 }
1258
1259 ipb.addr = iov->addr;
1260 ipb.size = iov->length;
1261 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1262 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1263
1264 dprintk("RPC: %s: phys convert: 0x%llx "
1265 "registered 0x%llx length %d\n",
Andrew Mortona56daeb2007-10-16 01:29:57 -07001266 __func__, (unsigned long long)ipb.addr,
1267 (unsigned long long)iov->addr, len);
\"Talpey, Thomas\c56c65f2007-09-10 13:51:18 -04001268
1269 if (IS_ERR(mr)) {
1270 *mrp = NULL;
1271 rc = PTR_ERR(mr);
1272 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1273 } else {
1274 *mrp = mr;
1275 iov->lkey = mr->lkey;
1276 rc = 0;
1277 }
1278
1279 return rc;
1280}
1281
1282int
1283rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1284 struct ib_mr *mr, struct ib_sge *iov)
1285{
1286 int rc;
1287
1288 ib_dma_unmap_single(ia->ri_id->device,
1289 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1290
1291 if (NULL == mr)
1292 return 0;
1293
1294 rc = ib_dereg_mr(mr);
1295 if (rc)
1296 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1297 return rc;
1298}
1299
1300/*
1301 * Wrappers for chunk registration, shared by read/write chunk code.
1302 */
1303
1304static void
1305rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1306{
1307 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1308 seg->mr_dmalen = seg->mr_len;
1309 if (seg->mr_page)
1310 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1311 seg->mr_page, offset_in_page(seg->mr_offset),
1312 seg->mr_dmalen, seg->mr_dir);
1313 else
1314 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1315 seg->mr_offset,
1316 seg->mr_dmalen, seg->mr_dir);
1317}
1318
1319static void
1320rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1321{
1322 if (seg->mr_page)
1323 ib_dma_unmap_page(ia->ri_id->device,
1324 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1325 else
1326 ib_dma_unmap_single(ia->ri_id->device,
1327 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1328}
1329
1330int
1331rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1332 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1333{
1334 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1335 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1336 IB_ACCESS_REMOTE_READ);
1337 struct rpcrdma_mr_seg *seg1 = seg;
1338 int i;
1339 int rc = 0;
1340
1341 switch (ia->ri_memreg_strategy) {
1342
1343#if RPCRDMA_PERSISTENT_REGISTRATION
1344 case RPCRDMA_ALLPHYSICAL:
1345 rpcrdma_map_one(ia, seg, writing);
1346 seg->mr_rkey = ia->ri_bind_mem->rkey;
1347 seg->mr_base = seg->mr_dma;
1348 seg->mr_nsegs = 1;
1349 nsegs = 1;
1350 break;
1351#endif
1352
1353 /* Registration using fast memory registration */
1354 case RPCRDMA_MTHCAFMR:
1355 {
1356 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1357 int len, pageoff = offset_in_page(seg->mr_offset);
1358 seg1->mr_offset -= pageoff; /* start of page */
1359 seg1->mr_len += pageoff;
1360 len = -pageoff;
1361 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1362 nsegs = RPCRDMA_MAX_DATA_SEGS;
1363 for (i = 0; i < nsegs;) {
1364 rpcrdma_map_one(ia, seg, writing);
1365 physaddrs[i] = seg->mr_dma;
1366 len += seg->mr_len;
1367 ++seg;
1368 ++i;
1369 /* Check for holes */
1370 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1371 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1372 break;
1373 }
1374 nsegs = i;
1375 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1376 physaddrs, nsegs, seg1->mr_dma);
1377 if (rc) {
1378 dprintk("RPC: %s: failed ib_map_phys_fmr "
1379 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1380 len, (unsigned long long)seg1->mr_dma,
1381 pageoff, nsegs, rc);
1382 while (nsegs--)
1383 rpcrdma_unmap_one(ia, --seg);
1384 } else {
1385 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1386 seg1->mr_base = seg1->mr_dma + pageoff;
1387 seg1->mr_nsegs = nsegs;
1388 seg1->mr_len = len;
1389 }
1390 }
1391 break;
1392
1393 /* Registration using memory windows */
1394 case RPCRDMA_MEMWINDOWS_ASYNC:
1395 case RPCRDMA_MEMWINDOWS:
1396 {
1397 struct ib_mw_bind param;
1398 rpcrdma_map_one(ia, seg, writing);
1399 param.mr = ia->ri_bind_mem;
1400 param.wr_id = 0ULL; /* no send cookie */
1401 param.addr = seg->mr_dma;
1402 param.length = seg->mr_len;
1403 param.send_flags = 0;
1404 param.mw_access_flags = mem_priv;
1405
1406 DECR_CQCOUNT(&r_xprt->rx_ep);
1407 rc = ib_bind_mw(ia->ri_id->qp,
1408 seg->mr_chunk.rl_mw->r.mw, &param);
1409 if (rc) {
1410 dprintk("RPC: %s: failed ib_bind_mw "
1411 "%u@0x%llx status %i\n",
1412 __func__, seg->mr_len,
1413 (unsigned long long)seg->mr_dma, rc);
1414 rpcrdma_unmap_one(ia, seg);
1415 } else {
1416 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1417 seg->mr_base = param.addr;
1418 seg->mr_nsegs = 1;
1419 nsegs = 1;
1420 }
1421 }
1422 break;
1423
1424 /* Default registration each time */
1425 default:
1426 {
1427 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1428 int len = 0;
1429 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1430 nsegs = RPCRDMA_MAX_DATA_SEGS;
1431 for (i = 0; i < nsegs;) {
1432 rpcrdma_map_one(ia, seg, writing);
1433 ipb[i].addr = seg->mr_dma;
1434 ipb[i].size = seg->mr_len;
1435 len += seg->mr_len;
1436 ++seg;
1437 ++i;
1438 /* Check for holes */
1439 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1440 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1441 break;
1442 }
1443 nsegs = i;
1444 seg1->mr_base = seg1->mr_dma;
1445 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1446 ipb, nsegs, mem_priv, &seg1->mr_base);
1447 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1448 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1449 dprintk("RPC: %s: failed ib_reg_phys_mr "
1450 "%u@0x%llx (%d)... status %i\n",
1451 __func__, len,
1452 (unsigned long long)seg1->mr_dma, nsegs, rc);
1453 while (nsegs--)
1454 rpcrdma_unmap_one(ia, --seg);
1455 } else {
1456 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1457 seg1->mr_nsegs = nsegs;
1458 seg1->mr_len = len;
1459 }
1460 }
1461 break;
1462 }
1463 if (rc)
1464 return -1;
1465
1466 return nsegs;
1467}
1468
1469int
1470rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1471 struct rpcrdma_xprt *r_xprt, void *r)
1472{
1473 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1474 struct rpcrdma_mr_seg *seg1 = seg;
1475 int nsegs = seg->mr_nsegs, rc;
1476
1477 switch (ia->ri_memreg_strategy) {
1478
1479#if RPCRDMA_PERSISTENT_REGISTRATION
1480 case RPCRDMA_ALLPHYSICAL:
1481 BUG_ON(nsegs != 1);
1482 rpcrdma_unmap_one(ia, seg);
1483 rc = 0;
1484 break;
1485#endif
1486
1487 case RPCRDMA_MTHCAFMR:
1488 {
1489 LIST_HEAD(l);
1490 list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
1491 rc = ib_unmap_fmr(&l);
1492 while (seg1->mr_nsegs--)
1493 rpcrdma_unmap_one(ia, seg++);
1494 }
1495 if (rc)
1496 dprintk("RPC: %s: failed ib_unmap_fmr,"
1497 " status %i\n", __func__, rc);
1498 break;
1499
1500 case RPCRDMA_MEMWINDOWS_ASYNC:
1501 case RPCRDMA_MEMWINDOWS:
1502 {
1503 struct ib_mw_bind param;
1504 BUG_ON(nsegs != 1);
1505 param.mr = ia->ri_bind_mem;
1506 param.addr = 0ULL; /* unbind */
1507 param.length = 0;
1508 param.mw_access_flags = 0;
1509 if (r) {
1510 param.wr_id = (u64) (unsigned long) r;
1511 param.send_flags = IB_SEND_SIGNALED;
1512 INIT_CQCOUNT(&r_xprt->rx_ep);
1513 } else {
1514 param.wr_id = 0ULL;
1515 param.send_flags = 0;
1516 DECR_CQCOUNT(&r_xprt->rx_ep);
1517 }
1518 rc = ib_bind_mw(ia->ri_id->qp,
1519 seg->mr_chunk.rl_mw->r.mw, &param);
1520 rpcrdma_unmap_one(ia, seg);
1521 }
1522 if (rc)
1523 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1524 " status %i\n", __func__, rc);
1525 else
1526 r = NULL; /* will upcall on completion */
1527 break;
1528
1529 default:
1530 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1531 seg1->mr_chunk.rl_mr = NULL;
1532 while (seg1->mr_nsegs--)
1533 rpcrdma_unmap_one(ia, seg++);
1534 if (rc)
1535 dprintk("RPC: %s: failed ib_dereg_mr,"
1536 " status %i\n", __func__, rc);
1537 break;
1538 }
1539 if (r) {
1540 struct rpcrdma_rep *rep = r;
1541 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1542 rep->rr_func = NULL;
1543 func(rep); /* dereg done, callback now */
1544 }
1545 return nsegs;
1546}
1547
1548/*
1549 * Prepost any receive buffer, then post send.
1550 *
1551 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1552 */
1553int
1554rpcrdma_ep_post(struct rpcrdma_ia *ia,
1555 struct rpcrdma_ep *ep,
1556 struct rpcrdma_req *req)
1557{
1558 struct ib_send_wr send_wr, *send_wr_fail;
1559 struct rpcrdma_rep *rep = req->rl_reply;
1560 int rc;
1561
1562 if (rep) {
1563 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1564 if (rc)
1565 goto out;
1566 req->rl_reply = NULL;
1567 }
1568
1569 send_wr.next = NULL;
1570 send_wr.wr_id = 0ULL; /* no send cookie */
1571 send_wr.sg_list = req->rl_send_iov;
1572 send_wr.num_sge = req->rl_niovs;
1573 send_wr.opcode = IB_WR_SEND;
1574 send_wr.imm_data = 0;
1575 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1576 ib_dma_sync_single_for_device(ia->ri_id->device,
1577 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1578 DMA_TO_DEVICE);
1579 ib_dma_sync_single_for_device(ia->ri_id->device,
1580 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1581 DMA_TO_DEVICE);
1582 ib_dma_sync_single_for_device(ia->ri_id->device,
1583 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1584 DMA_TO_DEVICE);
1585
1586 if (DECR_CQCOUNT(ep) > 0)
1587 send_wr.send_flags = 0;
1588 else { /* Provider must take a send completion every now and then */
1589 INIT_CQCOUNT(ep);
1590 send_wr.send_flags = IB_SEND_SIGNALED;
1591 }
1592
1593 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1594 if (rc)
1595 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1596 rc);
1597out:
1598 return rc;
1599}
1600
1601/*
1602 * (Re)post a receive buffer.
1603 */
1604int
1605rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1606 struct rpcrdma_ep *ep,
1607 struct rpcrdma_rep *rep)
1608{
1609 struct ib_recv_wr recv_wr, *recv_wr_fail;
1610 int rc;
1611
1612 recv_wr.next = NULL;
1613 recv_wr.wr_id = (u64) (unsigned long) rep;
1614 recv_wr.sg_list = &rep->rr_iov;
1615 recv_wr.num_sge = 1;
1616
1617 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1618 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1619
1620 DECR_CQCOUNT(ep);
1621 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1622
1623 if (rc)
1624 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1625 rc);
1626 return rc;
1627}