/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/o2iblnd/o2iblnd.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include <asm/div64.h>
#include <asm/page.h>
#include "o2iblnd.h"

static lnd_t the_o2iblnd;

kib_data_t kiblnd_data;

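/*
 * Simple rotate-and-add checksum over @nob bytes at @ptr: each step
 * rotates the 32-bit sum left by one bit, then adds the next byte.
 * Illustrative sketch of a single step (not part of the protocol spec,
 * just the loop body below):
 *
 *	sum = ((sum << 1) | (sum >> 31)) + *c++;
 *
 * e.g. starting from sum = 0, the bytes { 0x01, 0x02 } yield
 * 0x01, then rotate to 0x02, + 0x02 = 0x04.  A zero result is reserved
 * to mean "no checksum", so 1 is returned instead of 0.
 */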
static __u32 kiblnd_cksum(void *ptr, int nob)
{
        char *c = ptr;
        __u32 sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return !sum ? 1 : sum;
}

static char *kiblnd_msgtype2str(int type)
{
        switch (type) {
        case IBLND_MSG_CONNREQ:
                return "CONNREQ";

        case IBLND_MSG_CONNACK:
                return "CONNACK";

        case IBLND_MSG_NOOP:
                return "NOOP";

        case IBLND_MSG_IMMEDIATE:
                return "IMMEDIATE";

        case IBLND_MSG_PUT_REQ:
                return "PUT_REQ";

        case IBLND_MSG_PUT_NAK:
                return "PUT_NAK";

        case IBLND_MSG_PUT_ACK:
                return "PUT_ACK";

        case IBLND_MSG_PUT_DONE:
                return "PUT_DONE";

        case IBLND_MSG_GET_REQ:
                return "GET_REQ";

        case IBLND_MSG_GET_DONE:
                return "GET_DONE";

        default:
                return "???";
        }
}

static int kiblnd_msgtype2size(int type)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);

        switch (type) {
        case IBLND_MSG_CONNREQ:
        case IBLND_MSG_CONNACK:
                return hdr_size + sizeof(kib_connparams_t);

        case IBLND_MSG_NOOP:
                return hdr_size;

        case IBLND_MSG_IMMEDIATE:
                return offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]);

        case IBLND_MSG_PUT_REQ:
                return hdr_size + sizeof(kib_putreq_msg_t);

        case IBLND_MSG_PUT_ACK:
                return hdr_size + sizeof(kib_putack_msg_t);

        case IBLND_MSG_GET_REQ:
                return hdr_size + sizeof(kib_get_msg_t);

        case IBLND_MSG_PUT_NAK:
        case IBLND_MSG_PUT_DONE:
        case IBLND_MSG_GET_DONE:
                return hdr_size + sizeof(kib_completion_msg_t);
        default:
                return -1;
        }
}

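/*
 * Validate (and, when @flip is set, byte-swap) the RDMA descriptor
 * carried by a GET_REQ or PUT_ACK message.  Returns 0 on success and
 * 1 on a bogus fragment count or short message.  NB the fragment
 * array is only swabbed after its length has been validated.
 */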
static int kiblnd_unpack_rd(kib_msg_t *msg, int flip)
{
        kib_rdma_desc_t *rd;
        int nob;
        int n;
        int i;

        LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
                msg->ibm_type == IBLND_MSG_PUT_ACK);

        rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
             &msg->ibm_u.get.ibgm_rd :
             &msg->ibm_u.putack.ibpam_rd;

        if (flip) {
                __swab32s(&rd->rd_key);
                __swab32s(&rd->rd_nfrags);
        }

        n = rd->rd_nfrags;

        if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
                CERROR("Bad nfrags: %d, should be 0 < n <= %d\n",
                       n, IBLND_MAX_RDMA_FRAGS);
                return 1;
        }

        nob = offsetof(kib_msg_t, ibm_u) +
              kiblnd_rd_msg_size(rd, msg->ibm_type, n);

        if (msg->ibm_nob < nob) {
                CERROR("Short %s: %d(%d)\n",
                       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
                return 1;
        }

        if (!flip)
                return 0;

        for (i = 0; i < n; i++) {
                __swab32s(&rd->rd_frags[i].rf_nob);
                __swab64s(&rd->rd_frags[i].rf_addr);
        }

        return 0;
}

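/*
 * Finalise the common header of an outgoing message: magic, version,
 * credits, source/destination NIDs and stamps, and (optionally) the
 * checksum.  ibm_type and ibm_nob must already have been set by the
 * caller; the checksum is computed over the whole message while
 * ibm_cksum is still zero, matching the check in kiblnd_unpack_msg().
 */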
void kiblnd_pack_msg(lnet_ni_t *ni, kib_msg_t *msg, int version,
                     int credits, lnet_nid_t dstnid, __u64 dststamp)
{
        kib_net_t *net = ni->ni_data;

        /*
         * CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously.
         */
        msg->ibm_magic = IBLND_MSG_MAGIC;
        msg->ibm_version = version;
        /* ibm_type */
        msg->ibm_credits = credits;
        /* ibm_nob */
        msg->ibm_cksum = 0;
        msg->ibm_srcnid = ni->ni_nid;
        msg->ibm_srcstamp = net->ibn_incarnation;
        msg->ibm_dstnid = dstnid;
        msg->ibm_dststamp = dststamp;

        if (*kiblnd_tunables.kib_cksum) {
                /* NB ibm_cksum zero while computing cksum */
                msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
        }
}

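/*
 * Sanity-check and unpack a message received off the wire.  The peer's
 * endianness is detected from the magic; a byte-swapped magic means
 * every multi-byte field must be swabbed before use, i.e. (sketch of
 * the logic below):
 *
 *	if (msg->ibm_magic == IBLND_MSG_MAGIC)
 *		flip = 0;			 // same endianness
 *	else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC))
 *		flip = 1;			 // opposite endianness
 *	else
 *		return -EPROTO;			 // not an o2iblnd peer
 *
 * The magic itself is deliberately left unflipped as a clue to peer
 * endianness for anyone inspecting the message later.
 */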
int kiblnd_unpack_msg(kib_msg_t *msg, int nob)
{
        const int hdr_size = offsetof(kib_msg_t, ibm_u);
        __u32 msg_cksum;
        __u16 version;
        int msg_nob;
        int flip;

        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        if (msg->ibm_magic == IBLND_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
                flip = 1;
        } else {
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
        if (version != IBLND_MSG_VERSION &&
            version != IBLND_MSG_VERSION_1) {
                CERROR("Bad version: %x\n", version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /*
         * checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped
         */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum &&
            msg_cksum != kiblnd_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }

        msg->ibm_cksum = msg_cksum;

        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                msg->ibm_version = version;
                CLASSERT(sizeof(msg->ibm_type) == 1);
                CLASSERT(sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
        }

        if (msg->ibm_srcnid == LNET_NID_ANY) {
                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
                return -EPROTO;
        }

        if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
                CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
                       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
                return -EPROTO;
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;

        case IBLND_MSG_NOOP:
        case IBLND_MSG_IMMEDIATE:
        case IBLND_MSG_PUT_REQ:
                break;

        case IBLND_MSG_PUT_ACK:
        case IBLND_MSG_GET_REQ:
                if (kiblnd_unpack_rd(msg, flip))
                        return -EPROTO;
                break;

        case IBLND_MSG_PUT_NAK:
        case IBLND_MSG_PUT_DONE:
        case IBLND_MSG_GET_DONE:
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;

        case IBLND_MSG_CONNREQ:
        case IBLND_MSG_CONNACK:
                if (flip) {
                        __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
                        __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
                }
                break;
        }
        return 0;
}

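/*
 * Allocate and initialise a peer on the CPT its NID hashes to.  The
 * caller gets the single initial reference; the peer is not yet in the
 * peer table (ibp_list is initialised but unlinked).  net->ibn_npeers
 * is bumped under the global lock, which also guarantees the net is
 * not mid-shutdown.
 */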
int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
{
        kib_peer_t *peer;
        kib_net_t *net = ni->ni_data;
        int cpt = lnet_cpt_of_nid(nid);
        unsigned long flags;

        LASSERT(net);
        LASSERT(nid != LNET_NID_ANY);

        LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
        if (!peer) {
                CERROR("Cannot allocate peer\n");
                return -ENOMEM;
        }

        peer->ibp_ni = ni;
        peer->ibp_nid = nid;
        peer->ibp_error = 0;
        peer->ibp_last_alive = 0;
        peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
        peer->ibp_queue_depth = ni->ni_peertxcredits;
        atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */

        INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
        INIT_LIST_HEAD(&peer->ibp_conns);
        INIT_LIST_HEAD(&peer->ibp_tx_queue);

        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        /* always called with a ref on ni, which prevents ni being shutdown */
        LASSERT(!net->ibn_shutdown);

        /* npeers only grows with the global lock held */
        atomic_inc(&net->ibn_npeers);

        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        *peerp = peer;
        return 0;
}

void kiblnd_destroy_peer(kib_peer_t *peer)
{
        kib_net_t *net = peer->ibp_ni->ni_data;

        LASSERT(net);
        LASSERT(!atomic_read(&peer->ibp_refcount));
        LASSERT(!kiblnd_peer_active(peer));
        LASSERT(kiblnd_peer_idle(peer));
        LASSERT(list_empty(&peer->ibp_tx_queue));

        LIBCFS_FREE(peer, sizeof(*peer));

        /*
         * NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero.
         */
        atomic_dec(&net->ibn_npeers);
}

kib_peer_t *kiblnd_find_peer_locked(lnet_nid_t nid)
{
        /*
         * the caller is responsible for accounting the additional reference
         * that this creates
         */
        struct list_head *peer_list = kiblnd_nid2peerlist(nid);
        struct list_head *tmp;
        kib_peer_t *peer;

        list_for_each(tmp, peer_list) {
                peer = list_entry(tmp, kib_peer_t, ibp_list);
                LASSERT(!kiblnd_peer_idle(peer));

                if (peer->ibp_nid != nid)
                        continue;

                CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
                       peer, libcfs_nid2str(nid),
                       atomic_read(&peer->ibp_refcount),
                       peer->ibp_version);
                return peer;
        }
        return NULL;
}

void kiblnd_unlink_peer_locked(kib_peer_t *peer)
{
        LASSERT(list_empty(&peer->ibp_conns));

        LASSERT(kiblnd_peer_active(peer));
        list_del_init(&peer->ibp_list);
        /* lose peerlist's ref */
        kiblnd_peer_decref(peer);
}

static int kiblnd_get_peer_info(lnet_ni_t *ni, int index,
                                lnet_nid_t *nidp, int *count)
{
        kib_peer_t *peer;
        struct list_head *ptmp;
        int i;
        unsigned long flags;

        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (index-- > 0)
                                continue;

                        *nidp = peer->ibp_nid;
                        *count = atomic_read(&peer->ibp_refcount);

                        read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
                                               flags);
                        return 0;
                }
        }

        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        return -ENOENT;
}

static void kiblnd_del_peer_locked(kib_peer_t *peer)
{
        struct list_head *ctmp;
        struct list_head *cnxt;
        kib_conn_t *conn;

        if (list_empty(&peer->ibp_conns)) {
                kiblnd_unlink_peer_locked(peer);
        } else {
                list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
                        conn = list_entry(ctmp, kib_conn_t, ibc_list);

                        kiblnd_close_conn_locked(conn, 0);
                }
                /* NB closing peer's last conn unlinked it. */
        }
        /*
         * NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it.
         */
}

static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid)
{
        LIST_HEAD(zombies);
        struct list_head *ptmp;
        struct list_head *pnxt;
        kib_peer_t *peer;
        int lo;
        int hi;
        int i;
        unsigned long flags;
        int rc = -ENOENT;

        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY) {
                lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
                hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
        } else {
                lo = 0;
                hi = kiblnd_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        if (!list_empty(&peer->ibp_tx_queue)) {
                                LASSERT(list_empty(&peer->ibp_conns));

                                list_splice_init(&peer->ibp_tx_queue,
                                                 &zombies);
                        }

                        kiblnd_del_peer_locked(peer);
                        rc = 0; /* matched something */
                }
        }

        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        kiblnd_txlist_done(ni, &zombies, -EIO);

        return rc;
}

static kib_conn_t *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
{
        kib_peer_t *peer;
        struct list_head *ptmp;
        kib_conn_t *conn;
        struct list_head *ctmp;
        int i;
        unsigned long flags;

        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        list_for_each(ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = list_entry(ctmp, kib_conn_t,
                                                  ibc_list);
                                kiblnd_conn_addref(conn);
                                read_unlock_irqrestore(
                                        &kiblnd_data.kib_global_lock,
                                        flags);
                                return conn;
                        }
                }
        }

        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        return NULL;
}

int kiblnd_translate_mtu(int value)
{
        switch (value) {
        default:
                return -1;
        case 0:
                return 0;
        case 256:
                return IB_MTU_256;
        case 512:
                return IB_MTU_512;
        case 1024:
                return IB_MTU_1024;
        case 2048:
                return IB_MTU_2048;
        case 4096:
                return IB_MTU_4096;
        }
}

static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
{
        int mtu;

        /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
        if (!cmid->route.path_rec)
                return;

        mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
        LASSERT(mtu >= 0);
        if (mtu)
                cmid->route.path_rec->mtu = mtu;
}

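/*
 * Spread CQ completion interrupts for different peers over the
 * device's completion vectors: hash the peer NID to pick the Nth CPU
 * in the connection CPT's cpumask, then take that CPU's id modulo
 * num_comp_vectors.  E.g. with an 8-CPU mask, nid % 8 chooses the
 * CPU.  Returns 0 when the device has a single vector (nothing to
 * choose).
 */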
static int kiblnd_get_completion_vector(kib_conn_t *conn, int cpt)
{
        cpumask_t *mask;
        int vectors;
        int off;
        int i;
        lnet_nid_t nid = conn->ibc_peer->ibp_nid;

        vectors = conn->ibc_cmid->device->num_comp_vectors;
        if (vectors <= 1)
                return 0;

        mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
        if (!mask)
                return 0;

        /* hash NID to CPU id in this partition... */
        off = do_div(nid, cpumask_weight(mask));
        for_each_cpu(i, mask) {
                if (!off--)
                        return i % vectors;
        }

        LBUG();
        return 1;
}

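/*
 * Build a connection in state @state: allocate the conn and its RX
 * buffers, create the CQ and QP on the cmid's device, and post all
 * receives.  On success the conn owns the caller's ref on @peer and
 * owns @cmid; on failure both remain the caller's responsibility (see
 * the CAVEAT EMPTOR below).  Returns the new conn or NULL.
 */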
kib_conn_t *kiblnd_create_conn(kib_peer_t *peer, struct rdma_cm_id *cmid,
                               int state, int version)
{
        /*
         * CAVEAT EMPTOR:
         * If the new conn is created successfully it takes over the caller's
         * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
         * is destroyed. On failure, the caller's ref on 'peer' remains and
         * she must dispose of 'cmid'. (Actually I'd block forever if I tried
         * to destroy 'cmid' here since I'm called from the CM which still has
         * its ref on 'cmid').
         */
        rwlock_t *glock = &kiblnd_data.kib_global_lock;
        kib_net_t *net = peer->ibp_ni->ni_data;
        kib_dev_t *dev;
        struct ib_qp_init_attr *init_qp_attr;
        struct kib_sched_info *sched;
        struct ib_cq_init_attr cq_attr = {};
        kib_conn_t *conn;
        struct ib_cq *cq;
        unsigned long flags;
        int cpt;
        int rc;
        int i;

        LASSERT(net);
        LASSERT(!in_interrupt());

        dev = net->ibn_dev;

        cpt = lnet_cpt_of_nid(peer->ibp_nid);
        sched = kiblnd_data.kib_scheds[cpt];

        LASSERT(sched->ibs_nthreads > 0);

        LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
                         sizeof(*init_qp_attr));
        if (!init_qp_attr) {
                CERROR("Can't allocate qp_attr for %s\n",
                       libcfs_nid2str(peer->ibp_nid));
                goto failed_0;
        }

        LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
        if (!conn) {
                CERROR("Can't allocate connection for %s\n",
                       libcfs_nid2str(peer->ibp_nid));
                goto failed_1;
        }

        conn->ibc_state = IBLND_CONN_INIT;
        conn->ibc_version = version;
        conn->ibc_peer = peer;  /* I take the caller's ref */
        cmid->context = conn;   /* for future CM callbacks */
        conn->ibc_cmid = cmid;
        conn->ibc_max_frags = peer->ibp_max_frags;
        conn->ibc_queue_depth = peer->ibp_queue_depth;

        INIT_LIST_HEAD(&conn->ibc_early_rxs);
        INIT_LIST_HEAD(&conn->ibc_tx_noops);
        INIT_LIST_HEAD(&conn->ibc_tx_queue);
        INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
        INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
        INIT_LIST_HEAD(&conn->ibc_active_txs);
        spin_lock_init(&conn->ibc_lock);

        LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
                         sizeof(*conn->ibc_connvars));
        if (!conn->ibc_connvars) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed_2;
        }

        write_lock_irqsave(glock, flags);
        if (dev->ibd_failover) {
                write_unlock_irqrestore(glock, flags);
                CERROR("%s: failover in progress\n", dev->ibd_ifname);
                goto failed_2;
        }

        if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
                /* wakeup failover thread and teardown connection */
                if (kiblnd_dev_can_failover(dev)) {
                        list_add_tail(&dev->ibd_fail_list,
                                      &kiblnd_data.kib_failed_devs);
                        wake_up(&kiblnd_data.kib_failover_waitq);
                }

                write_unlock_irqrestore(glock, flags);
                CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
                       cmid->device->name, dev->ibd_ifname);
                goto failed_2;
        }

        kiblnd_hdev_addref_locked(dev->ibd_hdev);
        conn->ibc_hdev = dev->ibd_hdev;

        kiblnd_setup_mtu_locked(cmid);

        write_unlock_irqrestore(glock, flags);

        LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
                         IBLND_RX_MSGS(conn) * sizeof(kib_rx_t));
        if (!conn->ibc_rxs) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed_2;
        }

        rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
                                IBLND_RX_MSG_PAGES(conn));
        if (rc)
                goto failed_2;

        kiblnd_map_rx_descs(conn);

        cq_attr.cqe = IBLND_CQ_ENTRIES(conn);
        cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
        cq = ib_create_cq(cmid->device,
                          kiblnd_cq_completion, kiblnd_cq_event, conn,
                          &cq_attr);
        if (IS_ERR(cq)) {
                CERROR("Failed to create CQ with %d CQEs: %ld\n",
                       IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
                goto failed_2;
        }

        conn->ibc_cq = cq;

        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
        if (rc) {
                CERROR("Can't request completion notification: %d\n", rc);
                goto failed_2;
        }

        init_qp_attr->event_handler = kiblnd_qp_event;
        init_qp_attr->qp_context = conn;
        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
        init_qp_attr->cap.max_send_sge = 1;
        init_qp_attr->cap.max_recv_sge = 1;
        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
        init_qp_attr->qp_type = IB_QPT_RC;
        init_qp_attr->send_cq = cq;
        init_qp_attr->recv_cq = cq;

        conn->ibc_sched = sched;

        rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
        if (rc) {
                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
                       rc, init_qp_attr->cap.max_send_wr,
                       init_qp_attr->cap.max_recv_wr);
                goto failed_2;
        }

        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));

        /* 1 ref for caller and each rxmsg */
        atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
        conn->ibc_nrx = IBLND_RX_MSGS(conn);

        /* post receives */
        for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
                rc = kiblnd_post_rx(&conn->ibc_rxs[i],
                                    IBLND_POSTRX_NO_CREDIT);
                if (rc) {
                        CERROR("Can't post rxmsg: %d\n", rc);

                        /* Make posted receives complete */
                        kiblnd_abort_receives(conn);

                        /*
                         * correct # of posted buffers
                         * NB locking needed now I'm racing with completion
                         */
                        spin_lock_irqsave(&sched->ibs_lock, flags);
                        conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i;
                        spin_unlock_irqrestore(&sched->ibs_lock, flags);

                        /*
                         * cmid will be destroyed by CM(ofed) after cm_callback
                         * returned, so we can't refer it anymore
                         * (by kiblnd_connd()->kiblnd_destroy_conn)
                         */
                        rdma_destroy_qp(conn->ibc_cmid);
                        conn->ibc_cmid = NULL;

                        /* Drop my own and unused rxbuffer refcounts */
                        while (i++ <= IBLND_RX_MSGS(conn))
                                kiblnd_conn_decref(conn);

                        return NULL;
                }
        }

        /* Init successful! */
        LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
                state == IBLND_CONN_PASSIVE_WAIT);
        conn->ibc_state = state;

        /* 1 more conn */
        atomic_inc(&net->ibn_nconns);
        return conn;

 failed_2:
        kiblnd_destroy_conn(conn, true);
 failed_1:
        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
 failed_0:
        return NULL;
}

void kiblnd_destroy_conn(kib_conn_t *conn, bool free_conn)
{
        struct rdma_cm_id *cmid = conn->ibc_cmid;
        kib_peer_t *peer = conn->ibc_peer;
        int rc;

        LASSERT(!in_interrupt());
        LASSERT(!atomic_read(&conn->ibc_refcount));
        LASSERT(list_empty(&conn->ibc_early_rxs));
        LASSERT(list_empty(&conn->ibc_tx_noops));
        LASSERT(list_empty(&conn->ibc_tx_queue));
        LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
        LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
        LASSERT(list_empty(&conn->ibc_active_txs));
        LASSERT(!conn->ibc_noops_posted);
        LASSERT(!conn->ibc_nsends_posted);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBLND_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT(!conn->ibc_connvars);
                break;

        case IBLND_CONN_INIT:
                break;
        }

        /* conn->ibc_cmid might be destroyed by CM already */
        if (cmid && cmid->qp)
                rdma_destroy_qp(cmid);

        if (conn->ibc_cq) {
                rc = ib_destroy_cq(conn->ibc_cq);
                if (rc)
                        CWARN("Error destroying CQ: %d\n", rc);
        }

        if (conn->ibc_rx_pages)
                kiblnd_unmap_rx_descs(conn);

        if (conn->ibc_rxs) {
                LIBCFS_FREE(conn->ibc_rxs,
                            IBLND_RX_MSGS(conn) * sizeof(kib_rx_t));
        }

        if (conn->ibc_connvars)
                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_hdev)
                kiblnd_hdev_decref(conn->ibc_hdev);

        /* See CAVEAT EMPTOR above in kiblnd_create_conn */
        if (conn->ibc_state != IBLND_CONN_INIT) {
                kib_net_t *net = peer->ibp_ni->ni_data;

                kiblnd_peer_decref(peer);
                rdma_destroy_id(cmid);
                atomic_dec(&net->ibn_nconns);
        }

        LIBCFS_FREE(conn, sizeof(*conn));
}

int kiblnd_close_peer_conns_locked(kib_peer_t *peer, int why)
{
        kib_conn_t *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int count = 0;

        list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry(ctmp, kib_conn_t, ibc_list);

                CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_version, why);

                kiblnd_close_conn_locked(conn, why);
                count++;
        }

        return count;
}

int kiblnd_close_stale_conns_locked(kib_peer_t *peer,
                                    int version, __u64 incarnation)
{
        kib_conn_t *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int count = 0;

        list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry(ctmp, kib_conn_t, ibc_list);

                if (conn->ibc_version == version &&
                    conn->ibc_incarnation == incarnation)
                        continue;

                CDEBUG(D_NET,
                       "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_version, conn->ibc_incarnation,
                       version, incarnation);

                kiblnd_close_conn_locked(conn, -ESTALE);
                count++;
        }

        return count;
}

static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
{
        kib_peer_t *peer;
        struct list_head *ptmp;
        struct list_head *pnxt;
        int lo;
        int hi;
        int i;
        unsigned long flags;
        int count = 0;

        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY) {
                lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
                hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
        } else {
                lo = 0;
                hi = kiblnd_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                continue;

                        count += kiblnd_close_peer_conns_locked(peer, 0);
                }
        }

        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        /* wildcards always succeed */
        if (nid == LNET_NID_ANY)
                return 0;

        return !count ? -ENOENT : 0;
}

static int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
{
        struct libcfs_ioctl_data *data = arg;
        int rc = -EINVAL;

        switch (cmd) {
        case IOC_LIBCFS_GET_PEER: {
                lnet_nid_t nid = 0;
                int count = 0;

                rc = kiblnd_get_peer_info(ni, data->ioc_count,
                                          &nid, &count);
                data->ioc_nid = nid;
                data->ioc_count = count;
                break;
        }

        case IOC_LIBCFS_DEL_PEER: {
                rc = kiblnd_del_peer(ni, data->ioc_nid);
                break;
        }
        case IOC_LIBCFS_GET_CONN: {
                kib_conn_t *conn;

                rc = 0;
                conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
                if (!conn) {
                        rc = -ENOENT;
                        break;
                }

                LASSERT(conn->ibc_cmid);
                data->ioc_nid = conn->ibc_peer->ibp_nid;
                if (!conn->ibc_cmid->route.path_rec)
                        data->ioc_u32[0] = 0; /* iWarp has no path MTU */
                else
                        data->ioc_u32[0] =
                                ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
                kiblnd_conn_decref(conn);
                break;
        }
        case IOC_LIBCFS_CLOSE_CONNECTION: {
                rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
                break;
        }

        default:
                break;
        }

        return rc;
}

static void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when)
{
        unsigned long last_alive = 0;
        unsigned long now = cfs_time_current();
        rwlock_t *glock = &kiblnd_data.kib_global_lock;
        kib_peer_t *peer;
        unsigned long flags;

        read_lock_irqsave(glock, flags);

        peer = kiblnd_find_peer_locked(nid);
        if (peer)
                last_alive = peer->ibp_last_alive;

        read_unlock_irqrestore(glock, flags);

        if (last_alive)
                *when = last_alive;

        /*
         * peer is not persistent in hash, trigger peer creation
         * and connection establishment with a NULL tx
         */
        if (!peer)
                kiblnd_launch_tx(ni, NULL, nid);

        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
               libcfs_nid2str(nid), peer,
               last_alive ? cfs_duration_sec(now - last_alive) : -1);
}

static void kiblnd_free_pages(kib_pages_t *p)
{
        int npages = p->ibp_npages;
        int i;

        for (i = 0; i < npages; i++) {
                if (p->ibp_pages[i])
                        __free_page(p->ibp_pages[i]);
        }

        LIBCFS_FREE(p, offsetof(kib_pages_t, ibp_pages[npages]));
}

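/*
 * Allocate a page-array descriptor plus @npages pages on CPT @cpt,
 * spreading the page allocations over the CPT's NUMA nodes.
 * All-or-nothing: on any page allocation failure everything already
 * allocated is freed and -ENOMEM is returned.
 */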
int kiblnd_alloc_pages(kib_pages_t **pp, int cpt, int npages)
{
        kib_pages_t *p;
        int i;

        LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
                         offsetof(kib_pages_t, ibp_pages[npages]));
        if (!p) {
                CERROR("Can't allocate descriptor for %d pages\n", npages);
                return -ENOMEM;
        }

        memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
        p->ibp_npages = npages;

        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_pages_node(
                        cfs_cpt_spread_node(lnet_cpt_table(), cpt),
                        GFP_NOFS, 0);
                if (!p->ibp_pages[i]) {
                        CERROR("Can't allocate page %d of %d\n", i, npages);
                        kiblnd_free_pages(p);
                        return -ENOMEM;
                }
        }

        *pp = p;
        return 0;
}

void kiblnd_unmap_rx_descs(kib_conn_t *conn)
{
        kib_rx_t *rx;
        int i;

        LASSERT(conn->ibc_rxs);
        LASSERT(conn->ibc_hdev);

        for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
                rx = &conn->ibc_rxs[i];

                LASSERT(rx->rx_nob >= 0); /* not posted */

                kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
                                        KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
                                                          rx->rx_msgaddr),
                                        IBLND_MSG_SIZE, DMA_FROM_DEVICE);
        }

        kiblnd_free_pages(conn->ibc_rx_pages);

        conn->ibc_rx_pages = NULL;
}

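/*
 * Carve the connection's RX pages into IBLND_MSG_SIZE message buffers
 * and DMA-map each one.  IBLND_MSG_SIZE divides PAGE_SIZE exactly (see
 * the CLASSERTs in kiblnd_map_tx_pool()), so a buffer never straddles
 * a page boundary: e.g. with 4096-byte pages and 4096-byte messages
 * each page holds exactly one RX message, and pg_off wraps to 0 as
 * ipg advances to the next page.
 */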
void kiblnd_map_rx_descs(kib_conn_t *conn)
{
        kib_rx_t *rx;
        struct page *pg;
        int pg_off;
        int ipg;
        int i;

        for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) {
                pg = conn->ibc_rx_pages->ibp_pages[ipg];
                rx = &conn->ibc_rxs[i];

                rx->rx_conn = conn;
                rx->rx_msg = (kib_msg_t *)(((char *)page_address(pg)) + pg_off);

                rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
                                                       rx->rx_msg,
                                                       IBLND_MSG_SIZE,
                                                       DMA_FROM_DEVICE);
                LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
                                                  rx->rx_msgaddr));
                KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);

                CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
                       i, rx->rx_msg, rx->rx_msgaddr,
                       (__u64)(page_to_phys(pg) + pg_off));

                pg_off += IBLND_MSG_SIZE;
                LASSERT(pg_off <= PAGE_SIZE);

                if (pg_off == PAGE_SIZE) {
                        pg_off = 0;
                        ipg++;
                        LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn));
                }
        }
}

static void kiblnd_unmap_tx_pool(kib_tx_pool_t *tpo)
{
        kib_hca_dev_t *hdev = tpo->tpo_hdev;
        kib_tx_t *tx;
        int i;

        LASSERT(!tpo->tpo_pool.po_allocated);

        if (!hdev)
                return;

        for (i = 0; i < tpo->tpo_pool.po_size; i++) {
                tx = &tpo->tpo_tx_descs[i];
                kiblnd_dma_unmap_single(hdev->ibh_ibdev,
                                        KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
                                                          tx->tx_msgaddr),
                                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
        }

        kiblnd_hdev_decref(hdev);
        tpo->tpo_hdev = NULL;
}

static kib_hca_dev_t *kiblnd_current_hdev(kib_dev_t *dev)
{
        kib_hca_dev_t *hdev;
        unsigned long flags;
        int i = 0;

        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        while (dev->ibd_failover) {
                read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
                if (!(i++ % 50))
                        CDEBUG(D_NET, "%s: Wait for failover\n",
                               dev->ibd_ifname);
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(cfs_time_seconds(1) / 100);

                read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        }

        kiblnd_hdev_addref_locked(dev->ibd_hdev);
        hdev = dev->ibd_hdev;

        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        return hdev;
}

static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
{
        kib_pages_t *txpgs = tpo->tpo_tx_pages;
        kib_pool_t *pool = &tpo->tpo_pool;
        kib_net_t *net = pool->po_owner->ps_net;
        kib_dev_t *dev;
        struct page *page;
        kib_tx_t *tx;
        int page_offset;
        int ipage;
        int i;

        LASSERT(net);

        dev = net->ibn_dev;

        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT(!(PAGE_SIZE % IBLND_MSG_SIZE));

        tpo->tpo_hdev = kiblnd_current_hdev(dev);

        for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
                page = txpgs->ibp_pages[ipage];
                tx = &tpo->tpo_tx_descs[i];

                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
                                           page_offset);

                tx->tx_msgaddr = kiblnd_dma_map_single(
                        tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
                LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
                                                  tx->tx_msgaddr));
                KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);

                list_add(&tx->tx_list, &pool->po_free_list);

                page_offset += IBLND_MSG_SIZE;
                LASSERT(page_offset <= PAGE_SIZE);

                if (page_offset == PAGE_SIZE) {
                        page_offset = 0;
                        ipage++;
                        LASSERT(ipage <= txpgs->ibp_npages);
                }
        }
}

struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
                                    int negotiated_nfrags)
{
        kib_net_t *net = ni->ni_data;
        kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
        struct lnet_ioctl_config_o2iblnd_tunables *tunables;
        __u16 nfrags;
        int mod;

        tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
        mod = tunables->lnd_map_on_demand;
        nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;

        LASSERT(hdev->ibh_mrs);

        if (mod > 0 && nfrags <= rd->rd_nfrags)
                return NULL;

        return hdev->ibh_mrs;
}

static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
{
        LASSERT(!fpo->fpo_map_count);

        if (fpo->fpo_is_fmr) {
                if (fpo->fmr.fpo_fmr_pool)
                        ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
        } else {
                struct kib_fast_reg_descriptor *frd, *tmp;
                int i = 0;

                list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
                                         frd_list) {
                        list_del(&frd->frd_list);
                        ib_dereg_mr(frd->frd_mr);
                        LIBCFS_FREE(frd, sizeof(*frd));
                        i++;
                }
                if (i < fpo->fast_reg.fpo_pool_size)
                        CERROR("FastReg pool still has %d regions registered\n",
                               fpo->fast_reg.fpo_pool_size - i);
        }

        if (fpo->fpo_hdev)
                kiblnd_hdev_decref(fpo->fpo_hdev);

        LIBCFS_FREE(fpo, sizeof(*fpo));
}

static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
{
        kib_fmr_pool_t *fpo, *tmp;

        list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
                list_del(&fpo->fpo_list);
                kiblnd_destroy_fmr_pool(fpo);
        }
}

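/*
 * The FMR pool size and flush trigger tunables are totals for the
 * whole NI; each CPT gets an equal share, clamped below so a large
 * CPT count cannot shrink a pool to nothing.  For example, with
 * lnd_fmr_pool_size = 512 (an illustrative value) and 4 CPTs, each
 * poolset gets max(IBLND_FMR_POOL, 128) entries.
 */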
static int
kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
                     int ncpts)
{
        int size = tunables->lnd_fmr_pool_size / ncpts;

        return max(IBLND_FMR_POOL, size);
}

static int
kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
                         int ncpts)
{
        int size = tunables->lnd_fmr_flush_trigger / ncpts;

        return max(IBLND_FMR_POOL_FLUSH, size);
}

static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
{
        struct ib_fmr_pool_param param = {
                .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
                .page_shift = PAGE_SHIFT,
                .access = (IB_ACCESS_LOCAL_WRITE |
                           IB_ACCESS_REMOTE_WRITE),
                .pool_size = fps->fps_pool_size,
                .dirty_watermark = fps->fps_flush_trigger,
                .flush_function = NULL,
                .flush_arg = NULL,
                .cache = !!fps->fps_cache };
        int rc = 0;

        fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
                                                   &param);
        if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
                rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
                if (rc != -ENOSYS)
                        CERROR("Failed to create FMR pool: %d\n", rc);
                else
                        CERROR("FMRs are not supported\n");
        }

        return rc;
}

static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
{
        struct kib_fast_reg_descriptor *frd, *tmp;
        int i, rc;

        INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
        fpo->fast_reg.fpo_pool_size = 0;
        for (i = 0; i < fps->fps_pool_size; i++) {
                LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt,
                                 sizeof(*frd));
                if (!frd) {
                        CERROR("Failed to allocate a new fast_reg descriptor\n");
                        rc = -ENOMEM;
                        goto out;
                }

                frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
                                          IB_MR_TYPE_MEM_REG,
                                          LNET_MAX_PAYLOAD / PAGE_SIZE);
                if (IS_ERR(frd->frd_mr)) {
                        rc = PTR_ERR(frd->frd_mr);
                        CERROR("Failed to allocate ib_alloc_mr: %d\n", rc);
                        frd->frd_mr = NULL;
                        goto out_middle;
                }

                frd->frd_valid = true;

                list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
                fpo->fast_reg.fpo_pool_size++;
        }

        return 0;

out_middle:
        if (frd->frd_mr)
                ib_dereg_mr(frd->frd_mr);
        LIBCFS_FREE(frd, sizeof(*frd));

out:
        list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
                                 frd_list) {
                list_del(&frd->frd_list);
                ib_dereg_mr(frd->frd_mr);
                LIBCFS_FREE(frd, sizeof(*frd));
        }

        return rc;
}

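/*
 * Create one mapping pool for a poolset: prefer FMR when the device
 * implements the four FMR verbs, fall back to FastReg when it
 * advertises IB_DEVICE_MEM_MGT_EXTENSIONS, and fail with -ENOSYS when
 * it supports neither.
 */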
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001441static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
1442 kib_fmr_pool_t **pp_fpo)
1443{
1444 kib_dev_t *dev = fps->fps_net->ibn_dev;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001445 struct ib_device_attr *dev_attr;
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001446 kib_fmr_pool_t *fpo;
Peng Taod7e09d02013-05-02 16:46:55 +08001447 int rc;
1448
1449 LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
James Simmons06ace262016-02-12 12:06:08 -05001450 if (!fpo)
Peng Taod7e09d02013-05-02 16:46:55 +08001451 return -ENOMEM;
1452
1453 fpo->fpo_hdev = kiblnd_current_hdev(dev);
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001454 dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
Peng Taod7e09d02013-05-02 16:46:55 +08001455
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001456 /* Check for FMR or FastReg support */
1457 fpo->fpo_is_fmr = 0;
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001458 if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
1459 fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
1460 fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
1461 fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
1462 LCONSOLE_INFO("Using FMR for registration\n");
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001463 fpo->fpo_is_fmr = 1;
1464 } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
1465 LCONSOLE_INFO("Using FastReg for registration\n");
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001466 } else {
1467 rc = -ENOSYS;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001468 LCONSOLE_ERROR_MSG(rc, "IB device supports neither FMRs nor FastRegs, can't register memory\n");
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001469 goto out_fpo;
Peng Taod7e09d02013-05-02 16:46:55 +08001470 }
1471
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001472 if (fpo->fpo_is_fmr)
1473 rc = kiblnd_alloc_fmr_pool(fps, fpo);
1474 else
1475 rc = kiblnd_alloc_freg_pool(fps, fpo);
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001476 if (rc)
1477 goto out_fpo;
1478
Peng Taod7e09d02013-05-02 16:46:55 +08001479 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001480 fpo->fpo_owner = fps;
Peng Taod7e09d02013-05-02 16:46:55 +08001481 *pp_fpo = fpo;
1482
1483 return 0;
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001484
1485out_fpo:
1486 kiblnd_hdev_decref(fpo->fpo_hdev);
1487 LIBCFS_FREE(fpo, sizeof(*fpo));
1488 return rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001489}
1490
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001491static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps,
1492 struct list_head *zombies)
Peng Taod7e09d02013-05-02 16:46:55 +08001493{
James Simmons06ace262016-02-12 12:06:08 -05001494 if (!fps->fps_net) /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001495 return;
1496
1497 spin_lock(&fps->fps_lock);
1498
1499 while (!list_empty(&fps->fps_pool_list)) {
1500 kib_fmr_pool_t *fpo = list_entry(fps->fps_pool_list.next,
1501 kib_fmr_pool_t, fpo_list);
1502 fpo->fpo_failed = 1;
1503 list_del(&fpo->fpo_list);
James Simmons5fd88332016-02-12 12:06:09 -05001504 if (!fpo->fpo_map_count)
Peng Taod7e09d02013-05-02 16:46:55 +08001505 list_add(&fpo->fpo_list, zombies);
1506 else
1507 list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
1508 }
1509
1510 spin_unlock(&fps->fps_lock);
1511}
1512
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001513static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
Peng Taod7e09d02013-05-02 16:46:55 +08001514{
James Simmons06ace262016-02-12 12:06:08 -05001515 if (fps->fps_net) { /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001516 kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
1517 kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
1518 }
1519}
1520
Amir Shehata32c8deb82016-05-06 21:30:28 -04001521static int
1522kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts,
1523 kib_net_t *net,
1524 struct lnet_ioctl_config_o2iblnd_tunables *tunables)
Peng Taod7e09d02013-05-02 16:46:55 +08001525{
1526 kib_fmr_pool_t *fpo;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001527 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001528
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05001529 memset(fps, 0, sizeof(*fps));
Peng Taod7e09d02013-05-02 16:46:55 +08001530
1531 fps->fps_net = net;
1532 fps->fps_cpt = cpt;
Amir Shehata32c8deb82016-05-06 21:30:28 -04001533
1534 fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
1535 fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
1536 fps->fps_cache = tunables->lnd_fmr_cache;
1537
Peng Taod7e09d02013-05-02 16:46:55 +08001538 spin_lock_init(&fps->fps_lock);
1539 INIT_LIST_HEAD(&fps->fps_pool_list);
1540 INIT_LIST_HEAD(&fps->fps_failed_pool_list);
1541
1542 rc = kiblnd_create_fmr_pool(fps, &fpo);
James Simmons5fd88332016-02-12 12:06:09 -05001543 if (!rc)
Peng Taod7e09d02013-05-02 16:46:55 +08001544 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1545
1546 return rc;
1547}
1548
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001549static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now)
Peng Taod7e09d02013-05-02 16:46:55 +08001550{
James Simmons5fd88332016-02-12 12:06:09 -05001551 if (fpo->fpo_map_count) /* still in use */
Peng Taod7e09d02013-05-02 16:46:55 +08001552 return 0;
1553 if (fpo->fpo_failed)
1554 return 1;
1555 return cfs_time_aftereq(now, fpo->fpo_deadline);
1556}
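/*
 * A pool may be reclaimed once nothing maps through it and it has either
 * been marked failed or outlived IBLND_POOL_DEADLINE since its last use;
 * the callers additionally exempt the first (persistent) pool of each set.
 */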
1557
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001558static int
1559kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd)
1560{
1561 __u64 *pages = tx->tx_pages;
1562 kib_hca_dev_t *hdev;
1563 int npages;
1564 int size;
1565 int i;
1566
1567 hdev = tx->tx_pool->tpo_hdev;
1568
1569 for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
1570 for (size = 0; size < rd->rd_frags[i].rf_nob;
1571 size += hdev->ibh_page_size) {
1572 pages[npages++] = (rd->rd_frags[i].rf_addr &
1573 hdev->ibh_page_mask) + size;
1574 }
1575 }
1576
1577 return npages;
1578}
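/*
 * kiblnd_map_tx_pages() flattens the RDMA descriptor into the page array
 * expected by ib_fmr_pool_map_phys(): every fragment is walked in
 * ibh_page_size steps and each step contributes one page-aligned address.
 * With 4KB pages, fragments of 4KB + 8KB + 4KB would yield npages == 4
 * (figures illustrative).
 */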
1579
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001580void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
Peng Taod7e09d02013-05-02 16:46:55 +08001581{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001582 LIST_HEAD(zombies);
Mike Shueyec3d17c2015-05-19 10:14:36 -04001583 kib_fmr_pool_t *fpo = fmr->fmr_pool;
Dmitry Eremin1f199a02016-05-05 14:53:05 -04001584 kib_fmr_poolset_t *fps;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001585 unsigned long now = cfs_time_current();
1586 kib_fmr_pool_t *tmp;
1587 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001588
Dmitry Eremin1f199a02016-05-05 14:53:05 -04001589 if (!fpo)
1590 return;
Peng Taod7e09d02013-05-02 16:46:55 +08001591
Dmitry Eremin1f199a02016-05-05 14:53:05 -04001592 fps = fpo->fpo_owner;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001593 if (fpo->fpo_is_fmr) {
1594 if (fmr->fmr_pfmr) {
1595 rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
1596 LASSERT(!rc);
1597 fmr->fmr_pfmr = NULL;
1598 }
Peng Taod7e09d02013-05-02 16:46:55 +08001599
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001600 if (status) {
1601 rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
1602 LASSERT(!rc);
1603 }
1604 } else {
1605 struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
Peng Taod7e09d02013-05-02 16:46:55 +08001606
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001607 if (frd) {
1608 frd->frd_valid = false;
1609 spin_lock(&fps->fps_lock);
1610 list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
1611 spin_unlock(&fps->fps_lock);
1612 fmr->fmr_frd = NULL;
1613 }
Peng Taod7e09d02013-05-02 16:46:55 +08001614 }
Peng Taod7e09d02013-05-02 16:46:55 +08001615 fmr->fmr_pool = NULL;
Peng Taod7e09d02013-05-02 16:46:55 +08001616
1617 spin_lock(&fps->fps_lock);
Igor Ishchenko747327972015-01-12 18:16:26 +02001618 fpo->fpo_map_count--; /* decref the pool */
Peng Taod7e09d02013-05-02 16:46:55 +08001619
1620 list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
1621 /* the first pool is persistent */
1622 if (fps->fps_pool_list.next == &fpo->fpo_list)
1623 continue;
1624
1625 if (kiblnd_fmr_pool_is_idle(fpo, now)) {
1626 list_move(&fpo->fpo_list, &zombies);
Igor Ishchenko747327972015-01-12 18:16:26 +02001627 fps->fps_version++;
Peng Taod7e09d02013-05-02 16:46:55 +08001628 }
1629 }
1630 spin_unlock(&fps->fps_lock);
1631
1632 if (!list_empty(&zombies))
1633 kiblnd_destroy_fmr_pool_list(&zombies);
1634}
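/*
 * On the FastReg branch above a descriptor goes back on fpo_pool_list
 * with frd_valid == false; the next kiblnd_fmr_pool_map() that picks it
 * up prepares an IB_WR_LOCAL_INV and bumps the rkey before reuse, so a
 * stale key cannot grant remote access beyond its transfer.
 */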
1635
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001636int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx,
1637 kib_rdma_desc_t *rd, __u32 nob, __u64 iov,
1638 kib_fmr_t *fmr)
Peng Taod7e09d02013-05-02 16:46:55 +08001639{
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001640 __u64 *pages = tx->tx_pages;
1641 bool is_rx = (rd != tx->tx_rd);
1642 bool tx_pages_mapped = false;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001643 kib_fmr_pool_t *fpo;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001644 int npages = 0;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001645 __u64 version;
1646 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001647
1648 again:
1649 spin_lock(&fps->fps_lock);
1650 version = fps->fps_version;
1651 list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
1652 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1653 fpo->fpo_map_count++;
Peng Taod7e09d02013-05-02 16:46:55 +08001654
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001655 if (fpo->fpo_is_fmr) {
1656 struct ib_pool_fmr *pfmr;
1657
1658 spin_unlock(&fps->fps_lock);
1659
1660 if (!tx_pages_mapped) {
1661 npages = kiblnd_map_tx_pages(tx, rd);
1662 tx_pages_mapped = true;
1663 }
1664
1665 pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
1666 pages, npages, iov);
1667 if (likely(!IS_ERR(pfmr))) {
1668 fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
1669 pfmr->fmr->lkey;
1670 fmr->fmr_frd = NULL;
1671 fmr->fmr_pfmr = pfmr;
1672 fmr->fmr_pool = fpo;
1673 return 0;
1674 }
1675 rc = PTR_ERR(pfmr);
1676 } else {
1677 if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
1678 struct kib_fast_reg_descriptor *frd;
1679 struct ib_reg_wr *wr;
1680 struct ib_mr *mr;
1681 int n;
1682
1683 frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
1684 struct kib_fast_reg_descriptor,
1685 frd_list);
1686 list_del(&frd->frd_list);
1687 spin_unlock(&fps->fps_lock);
1688
1689 mr = frd->frd_mr;
1690
1691 if (!frd->frd_valid) {
1692 __u32 key = is_rx ? mr->rkey : mr->lkey;
1693 struct ib_send_wr *inv_wr;
1694
1695 inv_wr = &frd->frd_inv_wr;
1696 memset(inv_wr, 0, sizeof(*inv_wr));
1697 inv_wr->opcode = IB_WR_LOCAL_INV;
1698 inv_wr->wr_id = IBLND_WID_MR;
1699 inv_wr->ex.invalidate_rkey = key;
1700
1701 /* Bump the key */
1702 key = ib_inc_rkey(key);
1703 ib_update_fast_reg_key(mr, key);
1704 }
1705
1706 n = ib_map_mr_sg(mr, tx->tx_frags,
Linus Torvalds2f37dd12016-05-20 22:20:48 -07001707 tx->tx_nfrags, NULL, PAGE_SIZE);
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001708 if (unlikely(n != tx->tx_nfrags)) {
1709 CERROR("Failed to map mr %d/%d elements\n",
1710 n, tx->tx_nfrags);
1711 return n < 0 ? n : -EINVAL;
1712 }
1713
1714 mr->iova = iov;
1715
1716 /* Prepare FastReg WR */
1717 wr = &frd->frd_fastreg_wr;
1718 memset(wr, 0, sizeof(*wr));
1719 wr->wr.opcode = IB_WR_REG_MR;
1720 wr->wr.wr_id = IBLND_WID_MR;
1721 wr->wr.num_sge = 0;
1722 wr->wr.send_flags = 0;
1723 wr->mr = mr;
1724 wr->key = is_rx ? mr->rkey : mr->lkey;
1725 wr->access = (IB_ACCESS_LOCAL_WRITE |
1726 IB_ACCESS_REMOTE_WRITE);
1727
1728 fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
1729 fmr->fmr_frd = frd;
1730 fmr->fmr_pfmr = NULL;
1731 fmr->fmr_pool = fpo;
1732 return 0;
1733 }
1734 spin_unlock(&fps->fps_lock);
1735 rc = -EBUSY;
Peng Taod7e09d02013-05-02 16:46:55 +08001736 }
1737
1738 spin_lock(&fps->fps_lock);
1739 fpo->fpo_map_count--;
Dmitry Ereminc1b2e0b2016-05-05 14:53:04 -04001740 if (rc != -EAGAIN) {
Peng Taod7e09d02013-05-02 16:46:55 +08001741 spin_unlock(&fps->fps_lock);
Dmitry Ereminc1b2e0b2016-05-05 14:53:04 -04001742 return rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001743 }
1744
1745 /* -EAGAIN: if the pool set version changed, retry with the new pools */
1746 if (version != fps->fps_version) {
1747 spin_unlock(&fps->fps_lock);
1748 goto again;
1749 }
1750 }
1751
1752 if (fps->fps_increasing) {
1753 spin_unlock(&fps->fps_lock);
James Simmonsc314c312016-02-12 12:06:01 -05001754 CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for it to complete\n");
Peng Taod7e09d02013-05-02 16:46:55 +08001755 schedule();
1756 goto again;
Peng Taod7e09d02013-05-02 16:46:55 +08001757 }
1758
Greg Kroah-Hartman699503b2014-07-12 01:03:41 -07001759 if (time_before(cfs_time_current(), fps->fps_next_retry)) {
Peng Taod7e09d02013-05-02 16:46:55 +08001760 /* someone failed recently */
1761 spin_unlock(&fps->fps_lock);
1762 return -EAGAIN;
1763 }
1764
1765 fps->fps_increasing = 1;
1766 spin_unlock(&fps->fps_lock);
1767
1768 CDEBUG(D_NET, "Allocate new FMR pool\n");
1769 rc = kiblnd_create_fmr_pool(fps, &fpo);
1770 spin_lock(&fps->fps_lock);
1771 fps->fps_increasing = 0;
James Simmons5fd88332016-02-12 12:06:09 -05001772 if (!rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08001773 fps->fps_version++;
1774 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1775 } else {
1776 fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1777 }
1778 spin_unlock(&fps->fps_lock);
1779
1780 goto again;
1781}
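/*
 * A minimal sketch of the caller pattern (identifiers illustrative; the
 * real caller sits in this LND's tx mapping path):
 *
 *	kib_fmr_t fmr;
 *	int rc;
 *
 *	rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &fmr);
 *	if (rc)
 *		return rc;		no usable pool, or out of memory
 *	rd->rd_key = fmr.fmr_key;	one virtually-contiguous fragment
 *	... post the RDMA ...
 *	kiblnd_fmr_pool_unmap(&fmr, status);	status != 0 flushes FMRs
 */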
1782
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001783static void kiblnd_fini_pool(kib_pool_t *pool)
Peng Taod7e09d02013-05-02 16:46:55 +08001784{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001785 LASSERT(list_empty(&pool->po_free_list));
James Simmons5fd88332016-02-12 12:06:09 -05001786 LASSERT(!pool->po_allocated);
Peng Taod7e09d02013-05-02 16:46:55 +08001787
1788 CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
1789}
1790
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001791static void kiblnd_init_pool(kib_poolset_t *ps, kib_pool_t *pool, int size)
Peng Taod7e09d02013-05-02 16:46:55 +08001792{
1793 CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
1794
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05001795 memset(pool, 0, sizeof(*pool));
Peng Taod7e09d02013-05-02 16:46:55 +08001796 INIT_LIST_HEAD(&pool->po_free_list);
1797 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1798 pool->po_owner = ps;
1799 pool->po_size = size;
1800}
1801
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001802static void kiblnd_destroy_pool_list(struct list_head *head)
Peng Taod7e09d02013-05-02 16:46:55 +08001803{
1804 kib_pool_t *pool;
1805
1806 while (!list_empty(head)) {
1807 pool = list_entry(head->next, kib_pool_t, po_list);
1808 list_del(&pool->po_list);
1809
James Simmons06ace262016-02-12 12:06:08 -05001810 LASSERT(pool->po_owner);
Peng Taod7e09d02013-05-02 16:46:55 +08001811 pool->po_owner->ps_pool_destroy(pool);
1812 }
1813}
1814
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001815static void kiblnd_fail_poolset(kib_poolset_t *ps, struct list_head *zombies)
Peng Taod7e09d02013-05-02 16:46:55 +08001816{
James Simmons06ace262016-02-12 12:06:08 -05001817 if (!ps->ps_net) /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001818 return;
1819
1820 spin_lock(&ps->ps_lock);
1821 while (!list_empty(&ps->ps_pool_list)) {
1822 kib_pool_t *po = list_entry(ps->ps_pool_list.next,
1823 kib_pool_t, po_list);
1824 po->po_failed = 1;
1825 list_del(&po->po_list);
James Simmons5fd88332016-02-12 12:06:09 -05001826 if (!po->po_allocated)
Peng Taod7e09d02013-05-02 16:46:55 +08001827 list_add(&po->po_list, zombies);
1828 else
1829 list_add(&po->po_list, &ps->ps_failed_pool_list);
1830 }
1831 spin_unlock(&ps->ps_lock);
1832}
1833
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001834static void kiblnd_fini_poolset(kib_poolset_t *ps)
Peng Taod7e09d02013-05-02 16:46:55 +08001835{
James Simmons06ace262016-02-12 12:06:08 -05001836 if (ps->ps_net) { /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001837 kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
1838 kiblnd_destroy_pool_list(&ps->ps_pool_list);
1839 }
1840}
1841
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001842static int kiblnd_init_poolset(kib_poolset_t *ps, int cpt,
1843 kib_net_t *net, char *name, int size,
1844 kib_ps_pool_create_t po_create,
1845 kib_ps_pool_destroy_t po_destroy,
1846 kib_ps_node_init_t nd_init,
1847 kib_ps_node_fini_t nd_fini)
Peng Taod7e09d02013-05-02 16:46:55 +08001848{
Mike Shueyec3d17c2015-05-19 10:14:36 -04001849 kib_pool_t *pool;
1850 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001851
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05001852 memset(ps, 0, sizeof(*ps));
Peng Taod7e09d02013-05-02 16:46:55 +08001853
Mike Shueyec3d17c2015-05-19 10:14:36 -04001854 ps->ps_cpt = cpt;
1855 ps->ps_net = net;
Peng Taod7e09d02013-05-02 16:46:55 +08001856 ps->ps_pool_create = po_create;
1857 ps->ps_pool_destroy = po_destroy;
1858 ps->ps_node_init = nd_init;
1859 ps->ps_node_fini = nd_fini;
1860 ps->ps_pool_size = size;
1861 if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
1862 >= sizeof(ps->ps_name))
1863 return -E2BIG;
1864 spin_lock_init(&ps->ps_lock);
1865 INIT_LIST_HEAD(&ps->ps_pool_list);
1866 INIT_LIST_HEAD(&ps->ps_failed_pool_list);
1867
1868 rc = ps->ps_pool_create(ps, size, &pool);
James Simmons5fd88332016-02-12 12:06:09 -05001869 if (!rc)
Peng Taod7e09d02013-05-02 16:46:55 +08001870 list_add(&pool->po_list, &ps->ps_pool_list);
1871 else
1872 CERROR("Failed to create the first pool for %s\n", ps->ps_name);
1873
1874 return rc;
1875}
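/*
 * kiblnd_init_poolset() wires up the per-type callbacks (pool create and
 * destroy, plus optional per-node init/fini) and creates the first pool
 * straight away; that pool becomes the persistent one which the idle
 * reaping in kiblnd_pool_free_node() never retires.
 */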
1876
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001877static int kiblnd_pool_is_idle(kib_pool_t *pool, unsigned long now)
Peng Taod7e09d02013-05-02 16:46:55 +08001878{
James Simmons5fd88332016-02-12 12:06:09 -05001879 if (pool->po_allocated) /* still in use */
Peng Taod7e09d02013-05-02 16:46:55 +08001880 return 0;
1881 if (pool->po_failed)
1882 return 1;
1883 return cfs_time_aftereq(now, pool->po_deadline);
1884}
1885
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001886void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node)
Peng Taod7e09d02013-05-02 16:46:55 +08001887{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001888 LIST_HEAD(zombies);
Mike Shueyec3d17c2015-05-19 10:14:36 -04001889 kib_poolset_t *ps = pool->po_owner;
1890 kib_pool_t *tmp;
1891 unsigned long now = cfs_time_current();
Peng Taod7e09d02013-05-02 16:46:55 +08001892
1893 spin_lock(&ps->ps_lock);
1894
James Simmons06ace262016-02-12 12:06:08 -05001895 if (ps->ps_node_fini)
Peng Taod7e09d02013-05-02 16:46:55 +08001896 ps->ps_node_fini(pool, node);
1897
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001898 LASSERT(pool->po_allocated > 0);
Peng Taod7e09d02013-05-02 16:46:55 +08001899 list_add(node, &pool->po_free_list);
Igor Ishchenko747327972015-01-12 18:16:26 +02001900 pool->po_allocated--;
Peng Taod7e09d02013-05-02 16:46:55 +08001901
1902 list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
1903 /* the first pool is persistent */
1904 if (ps->ps_pool_list.next == &pool->po_list)
1905 continue;
1906
1907 if (kiblnd_pool_is_idle(pool, now))
1908 list_move(&pool->po_list, &zombies);
1909 }
1910 spin_unlock(&ps->ps_lock);
1911
1912 if (!list_empty(&zombies))
1913 kiblnd_destroy_pool_list(&zombies);
1914}
1915
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001916struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps)
Peng Taod7e09d02013-05-02 16:46:55 +08001917{
Mike Shueyec3d17c2015-05-19 10:14:36 -04001918 struct list_head *node;
1919 kib_pool_t *pool;
Liang Zhenea363b42016-03-02 18:53:30 -05001920 unsigned int interval = 1;
1921 unsigned long time_before;
1922 unsigned int trips = 0;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001923 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001924
1925 again:
1926 spin_lock(&ps->ps_lock);
1927 list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
1928 if (list_empty(&pool->po_free_list))
1929 continue;
1930
Igor Ishchenko747327972015-01-12 18:16:26 +02001931 pool->po_allocated++;
Peng Taod7e09d02013-05-02 16:46:55 +08001932 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1933 node = pool->po_free_list.next;
1934 list_del(node);
1935
James Simmons06ace262016-02-12 12:06:08 -05001936 if (ps->ps_node_init) {
Peng Taod7e09d02013-05-02 16:46:55 +08001937 /* still hold the lock */
1938 ps->ps_node_init(pool, node);
1939 }
1940 spin_unlock(&ps->ps_lock);
1941 return node;
1942 }
1943
1944 /* no available tx pool and ... */
1945 if (ps->ps_increasing) {
1946 /* another thread is allocating a new pool */
1947 spin_unlock(&ps->ps_lock);
Liang Zhenea363b42016-03-02 18:53:30 -05001948 trips++;
1949 CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d jiffies for it to complete. trips = %d\n",
1950 ps->ps_name, interval, trips);
1951
1952 set_current_state(TASK_INTERRUPTIBLE);
1953 schedule_timeout(interval);
1954 if (interval < cfs_time_seconds(1))
1955 interval *= 2;
1956
Peng Taod7e09d02013-05-02 16:46:55 +08001957 goto again;
1958 }
1959
Greg Kroah-Hartman699503b2014-07-12 01:03:41 -07001960 if (time_before(cfs_time_current(), ps->ps_next_retry)) {
Peng Taod7e09d02013-05-02 16:46:55 +08001961 /* someone failed recently */
1962 spin_unlock(&ps->ps_lock);
1963 return NULL;
1964 }
1965
1966 ps->ps_increasing = 1;
1967 spin_unlock(&ps->ps_lock);
1968
1969 CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
Liang Zhenea363b42016-03-02 18:53:30 -05001970 time_before = cfs_time_current();
Peng Taod7e09d02013-05-02 16:46:55 +08001971 rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
Liang Zhenea363b42016-03-02 18:53:30 -05001972 CDEBUG(D_NET, "ps_pool_create took %lu jiffies to complete\n",
1973 cfs_time_current() - time_before);
Peng Taod7e09d02013-05-02 16:46:55 +08001974
1975 spin_lock(&ps->ps_lock);
1976 ps->ps_increasing = 0;
James Simmons5fd88332016-02-12 12:06:09 -05001977 if (!rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08001978 list_add_tail(&pool->po_list, &ps->ps_pool_list);
1979 } else {
1980 ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1981 CERROR("Can't allocate new %s pool: out of memory\n",
1982 ps->ps_name);
1983 }
1984 spin_unlock(&ps->ps_lock);
1985
1986 goto again;
1987}
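/*
 * The retry above backs off exponentially while another thread grows the
 * pool set: 1 jiffy, then 2, 4, ... doubling until the interval reaches
 * roughly one second; "trips" merely counts retries for the debug log.
 */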
1988
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001989static void kiblnd_destroy_tx_pool(kib_pool_t *pool)
Peng Taod7e09d02013-05-02 16:46:55 +08001990{
Mike Shueyec3d17c2015-05-19 10:14:36 -04001991 kib_tx_pool_t *tpo = container_of(pool, kib_tx_pool_t, tpo_pool);
1992 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08001993
James Simmons5fd88332016-02-12 12:06:09 -05001994 LASSERT(!pool->po_allocated);
Peng Taod7e09d02013-05-02 16:46:55 +08001995
James Simmons06ace262016-02-12 12:06:08 -05001996 if (tpo->tpo_tx_pages) {
Peng Taod7e09d02013-05-02 16:46:55 +08001997 kiblnd_unmap_tx_pool(tpo);
1998 kiblnd_free_pages(tpo->tpo_tx_pages);
1999 }
2000
James Simmons06ace262016-02-12 12:06:08 -05002001 if (!tpo->tpo_tx_descs)
Peng Taod7e09d02013-05-02 16:46:55 +08002002 goto out;
2003
2004 for (i = 0; i < pool->po_size; i++) {
2005 kib_tx_t *tx = &tpo->tpo_tx_descs[i];
2006
2007 list_del(&tx->tx_list);
James Simmons06ace262016-02-12 12:06:08 -05002008 if (tx->tx_pages)
Peng Taod7e09d02013-05-02 16:46:55 +08002009 LIBCFS_FREE(tx->tx_pages,
2010 LNET_MAX_IOV *
2011 sizeof(*tx->tx_pages));
James Simmons06ace262016-02-12 12:06:08 -05002012 if (tx->tx_frags)
Peng Taod7e09d02013-05-02 16:46:55 +08002013 LIBCFS_FREE(tx->tx_frags,
James Simmons147280d2016-05-09 10:53:48 -04002014 (1 + IBLND_MAX_RDMA_FRAGS) *
2015 sizeof(*tx->tx_frags));
James Simmons06ace262016-02-12 12:06:08 -05002016 if (tx->tx_wrq)
Peng Taod7e09d02013-05-02 16:46:55 +08002017 LIBCFS_FREE(tx->tx_wrq,
2018 (1 + IBLND_MAX_RDMA_FRAGS) *
2019 sizeof(*tx->tx_wrq));
James Simmons06ace262016-02-12 12:06:08 -05002020 if (tx->tx_sge)
Peng Taod7e09d02013-05-02 16:46:55 +08002021 LIBCFS_FREE(tx->tx_sge,
2022 (1 + IBLND_MAX_RDMA_FRAGS) *
2023 sizeof(*tx->tx_sge));
James Simmons06ace262016-02-12 12:06:08 -05002024 if (tx->tx_rd)
Peng Taod7e09d02013-05-02 16:46:55 +08002025 LIBCFS_FREE(tx->tx_rd,
2026 offsetof(kib_rdma_desc_t,
2027 rd_frags[IBLND_MAX_RDMA_FRAGS]));
2028 }
2029
2030 LIBCFS_FREE(tpo->tpo_tx_descs,
2031 pool->po_size * sizeof(kib_tx_t));
2032out:
2033 kiblnd_fini_pool(pool);
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05002034 LIBCFS_FREE(tpo, sizeof(*tpo));
Peng Taod7e09d02013-05-02 16:46:55 +08002035}
2036
2037static int kiblnd_tx_pool_size(int ncpts)
2038{
2039 int ntx = *kiblnd_tunables.kib_ntx / ncpts;
2040
2041 return max(IBLND_TX_POOL, ntx);
2042}
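/*
 * The configured tx count is split evenly across CPTs with IBLND_TX_POOL
 * as a floor, so hosts with many CPTs still get a usable pool each;
 * e.g. ntx = 512 over 4 CPTs gives 128 descriptors per pool (figures
 * illustrative, ntx is a module parameter).
 */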
2043
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002044static int kiblnd_create_tx_pool(kib_poolset_t *ps, int size,
2045 kib_pool_t **pp_po)
Peng Taod7e09d02013-05-02 16:46:55 +08002046{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002047 int i;
2048 int npg;
2049 kib_pool_t *pool;
Peng Taod7e09d02013-05-02 16:46:55 +08002050 kib_tx_pool_t *tpo;
2051
2052 LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
James Simmons06ace262016-02-12 12:06:08 -05002053 if (!tpo) {
Peng Taod7e09d02013-05-02 16:46:55 +08002054 CERROR("Failed to allocate TX pool\n");
2055 return -ENOMEM;
2056 }
2057
2058 pool = &tpo->tpo_pool;
2059 kiblnd_init_pool(ps, pool, size);
2060 tpo->tpo_tx_descs = NULL;
2061 tpo->tpo_tx_pages = NULL;
2062
2063 npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
James Simmons5fd88332016-02-12 12:06:09 -05002064 if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) {
Peng Taod7e09d02013-05-02 16:46:55 +08002065 CERROR("Can't allocate tx pages: %d\n", npg);
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05002066 LIBCFS_FREE(tpo, sizeof(*tpo));
Peng Taod7e09d02013-05-02 16:46:55 +08002067 return -ENOMEM;
2068 }
2069
2070 LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
2071 size * sizeof(kib_tx_t));
James Simmons06ace262016-02-12 12:06:08 -05002072 if (!tpo->tpo_tx_descs) {
Peng Taod7e09d02013-05-02 16:46:55 +08002073 CERROR("Can't allocate %d tx descriptors\n", size);
2074 ps->ps_pool_destroy(pool);
2075 return -ENOMEM;
2076 }
2077
2078 memset(tpo->tpo_tx_descs, 0, size * sizeof(kib_tx_t));
2079
2080 for (i = 0; i < size; i++) {
2081 kib_tx_t *tx = &tpo->tpo_tx_descs[i];
2082
2083 tx->tx_pool = tpo;
James Simmons06ace262016-02-12 12:06:08 -05002084 if (ps->ps_net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002085 LIBCFS_CPT_ALLOC(tx->tx_pages,
2086 lnet_cpt_table(), ps->ps_cpt,
2087 LNET_MAX_IOV * sizeof(*tx->tx_pages));
James Simmons06ace262016-02-12 12:06:08 -05002088 if (!tx->tx_pages)
Peng Taod7e09d02013-05-02 16:46:55 +08002089 break;
2090 }
2091
2092 LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
James Simmons147280d2016-05-09 10:53:48 -04002093 (1 + IBLND_MAX_RDMA_FRAGS) *
2094 sizeof(*tx->tx_frags));
James Simmons06ace262016-02-12 12:06:08 -05002095 if (!tx->tx_frags)
Peng Taod7e09d02013-05-02 16:46:55 +08002096 break;
2097
James Simmons147280d2016-05-09 10:53:48 -04002098 sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
Peng Taod7e09d02013-05-02 16:46:55 +08002099
2100 LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
2101 (1 + IBLND_MAX_RDMA_FRAGS) *
2102 sizeof(*tx->tx_wrq));
James Simmons06ace262016-02-12 12:06:08 -05002103 if (!tx->tx_wrq)
Peng Taod7e09d02013-05-02 16:46:55 +08002104 break;
2105
2106 LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
2107 (1 + IBLND_MAX_RDMA_FRAGS) *
2108 sizeof(*tx->tx_sge));
James Simmons06ace262016-02-12 12:06:08 -05002109 if (!tx->tx_sge)
Peng Taod7e09d02013-05-02 16:46:55 +08002110 break;
2111
2112 LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
2113 offsetof(kib_rdma_desc_t,
2114 rd_frags[IBLND_MAX_RDMA_FRAGS]));
James Simmons06ace262016-02-12 12:06:08 -05002115 if (!tx->tx_rd)
Peng Taod7e09d02013-05-02 16:46:55 +08002116 break;
2117 }
2118
2119 if (i == size) {
2120 kiblnd_map_tx_pool(tpo);
2121 *pp_po = pool;
2122 return 0;
2123 }
2124
2125 ps->ps_pool_destroy(pool);
2126 return -ENOMEM;
2127}
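/*
 * Allocation above is all-or-nothing: the loop breaks on the first
 * failed LIBCFS_CPT_ALLOC() and the final i == size test decides between
 * mapping the finished pool and handing it back to ps_pool_destroy(),
 * which copes with partially initialised descriptors (see
 * kiblnd_destroy_tx_pool()).
 */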
2128
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002129static void kiblnd_tx_init(kib_pool_t *pool, struct list_head *node)
Peng Taod7e09d02013-05-02 16:46:55 +08002130{
2131 kib_tx_poolset_t *tps = container_of(pool->po_owner, kib_tx_poolset_t,
2132 tps_poolset);
Mike Shueyec3d17c2015-05-19 10:14:36 -04002133 kib_tx_t *tx = list_entry(node, kib_tx_t, tx_list);
Peng Taod7e09d02013-05-02 16:46:55 +08002134
Igor Ishchenko747327972015-01-12 18:16:26 +02002135 tx->tx_cookie = tps->tps_next_tx_cookie++;
Peng Taod7e09d02013-05-02 16:46:55 +08002136}
2137
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002138static void kiblnd_net_fini_pools(kib_net_t *net)
Peng Taod7e09d02013-05-02 16:46:55 +08002139{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002140 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002141
2142 cfs_cpt_for_each(i, lnet_cpt_table()) {
Mike Shueyec3d17c2015-05-19 10:14:36 -04002143 kib_tx_poolset_t *tps;
2144 kib_fmr_poolset_t *fps;
Peng Taod7e09d02013-05-02 16:46:55 +08002145
James Simmons06ace262016-02-12 12:06:08 -05002146 if (net->ibn_tx_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002147 tps = net->ibn_tx_ps[i];
2148 kiblnd_fini_poolset(&tps->tps_poolset);
2149 }
2150
James Simmons06ace262016-02-12 12:06:08 -05002151 if (net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002152 fps = net->ibn_fmr_ps[i];
2153 kiblnd_fini_fmr_poolset(fps);
2154 }
Peng Taod7e09d02013-05-02 16:46:55 +08002155 }
2156
James Simmons06ace262016-02-12 12:06:08 -05002157 if (net->ibn_tx_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002158 cfs_percpt_free(net->ibn_tx_ps);
2159 net->ibn_tx_ps = NULL;
2160 }
2161
James Simmons06ace262016-02-12 12:06:08 -05002162 if (net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002163 cfs_percpt_free(net->ibn_fmr_ps);
2164 net->ibn_fmr_ps = NULL;
2165 }
Peng Taod7e09d02013-05-02 16:46:55 +08002166}
2167
Amir Shehata32c8deb82016-05-06 21:30:28 -04002168static int kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts,
2169 int ncpts)
Peng Taod7e09d02013-05-02 16:46:55 +08002170{
Amir Shehata32c8deb82016-05-06 21:30:28 -04002171 struct lnet_ioctl_config_o2iblnd_tunables *tunables;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002172 unsigned long flags;
2173 int cpt;
Amir Shehata32c8deb82016-05-06 21:30:28 -04002174 int rc;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002175 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002176
Amir Shehata32c8deb82016-05-06 21:30:28 -04002177 tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
2178
Peng Taod7e09d02013-05-02 16:46:55 +08002179 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
Amir Shehata32c8deb82016-05-06 21:30:28 -04002180 if (!tunables->lnd_map_on_demand) {
Mike Shueyec3d17c2015-05-19 10:14:36 -04002181 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
Peng Taod7e09d02013-05-02 16:46:55 +08002182 goto create_tx_pool;
2183 }
2184
2185 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2186
Amir Shehata32c8deb82016-05-06 21:30:28 -04002187 if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
Peng Taod7e09d02013-05-02 16:46:55 +08002188 CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
Amir Shehata32c8deb82016-05-06 21:30:28 -04002189 tunables->lnd_fmr_pool_size,
Peng Taod7e09d02013-05-02 16:46:55 +08002190 *kiblnd_tunables.kib_ntx / 4);
2191 rc = -EINVAL;
2192 goto failed;
2193 }
2194
Oleg Drokin415bcb52015-08-18 21:04:35 -04002195 /*
2196 * TX pool must be created later than FMR, see LU-2268
2197 * for details
2198 */
James Simmons06ace262016-02-12 12:06:08 -05002199 LASSERT(!net->ibn_tx_ps);
Peng Taod7e09d02013-05-02 16:46:55 +08002200
Oleg Drokin415bcb52015-08-18 21:04:35 -04002201 /*
2202 * premapping can fail if ibd_nmr > 1, so we always create
2203 * FMR pool and map-on-demand if premapping failed
James Simmons7e221b62016-03-24 11:24:02 -04002204 *
2205 * cfs_percpt_alloc is creating an array of struct kib_fmr_poolset.
2206 * The number of struct kib_fmr_poolsets created is equal to the
2207 * number of CPTs that exist, i.e. net->ibn_fmr_ps[cpt].
Oleg Drokin415bcb52015-08-18 21:04:35 -04002208 */
Peng Taod7e09d02013-05-02 16:46:55 +08002209 net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
2210 sizeof(kib_fmr_poolset_t));
James Simmons06ace262016-02-12 12:06:08 -05002211 if (!net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002212 CERROR("Failed to allocate FMR pool array\n");
2213 rc = -ENOMEM;
2214 goto failed;
2215 }
2216
2217 for (i = 0; i < ncpts; i++) {
James Simmons06ace262016-02-12 12:06:08 -05002218 cpt = !cpts ? i : cpts[i];
Amir Shehata32c8deb82016-05-06 21:30:28 -04002219 rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
2220 net, tunables);
Amir Shehata7cadcc72016-03-02 17:02:03 -05002221 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002222 CERROR("Can't initialize FMR pool for CPT %d: %d\n",
2223 cpt, rc);
2224 goto failed;
2225 }
2226 }
2227
Amir Shehata7cadcc72016-03-02 17:02:03 -05002228 if (i > 0)
Peng Taod7e09d02013-05-02 16:46:55 +08002229 LASSERT(i == ncpts);
Peng Taod7e09d02013-05-02 16:46:55 +08002230
2231 create_tx_pool:
James Simmons7e221b62016-03-24 11:24:02 -04002232 /*
2233 * cfs_percpt_alloc is creating an array of struct kib_tx_poolset.
2234 * The number of struct kib_tx_poolsets created is equal to the
2235 * number of CPTs that exist, i.e. net->ibn_tx_ps[cpt].
2236 */
Peng Taod7e09d02013-05-02 16:46:55 +08002237 net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
2238 sizeof(kib_tx_poolset_t));
James Simmons06ace262016-02-12 12:06:08 -05002239 if (!net->ibn_tx_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002240 CERROR("Failed to allocate tx pool array\n");
2241 rc = -ENOMEM;
2242 goto failed;
2243 }
2244
2245 for (i = 0; i < ncpts; i++) {
James Simmons06ace262016-02-12 12:06:08 -05002246 cpt = !cpts ? i : cpts[i];
Peng Taod7e09d02013-05-02 16:46:55 +08002247 rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
2248 cpt, net, "TX",
2249 kiblnd_tx_pool_size(ncpts),
2250 kiblnd_create_tx_pool,
2251 kiblnd_destroy_tx_pool,
2252 kiblnd_tx_init, NULL);
James Simmons5fd88332016-02-12 12:06:09 -05002253 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002254 CERROR("Can't initialize TX pool for CPT %d: %d\n",
2255 cpt, rc);
2256 goto failed;
2257 }
2258 }
2259
2260 return 0;
2261 failed:
2262 kiblnd_net_fini_pools(net);
James Simmons5fd88332016-02-12 12:06:09 -05002263 LASSERT(rc);
Peng Taod7e09d02013-05-02 16:46:55 +08002264 return rc;
2265}
2266
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002267static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002268{
James Simmons4420cfd2016-02-12 12:06:00 -05002269 /*
2270 * It's safe to assume a HCA can handle a page size
2271 * matching that of the native system
2272 */
Peng Taod7e09d02013-05-02 16:46:55 +08002273 hdev->ibh_page_shift = PAGE_SHIFT;
2274 hdev->ibh_page_size = 1 << PAGE_SHIFT;
2275 hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
2276
Or Gerlitzcebfe5c2015-12-18 10:59:49 +02002277 hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
Peng Taod7e09d02013-05-02 16:46:55 +08002278 if (hdev->ibh_mr_size == ~0ULL) {
2279 hdev->ibh_mr_shift = 64;
2280 return 0;
2281 }
2282
Greg Kroah-Hartman55f5a822014-07-12 20:26:07 -07002283 CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
Peng Taod7e09d02013-05-02 16:46:55 +08002284 return -EINVAL;
2285}
2286
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002287static void kiblnd_hdev_cleanup_mrs(kib_hca_dev_t *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002288{
Amir Shehata7cadcc72016-03-02 17:02:03 -05002289 if (!hdev->ibh_mrs)
Peng Taod7e09d02013-05-02 16:46:55 +08002290 return;
2291
Amir Shehata7cadcc72016-03-02 17:02:03 -05002292 ib_dereg_mr(hdev->ibh_mrs);
Peng Taod7e09d02013-05-02 16:46:55 +08002293
Amir Shehata7cadcc72016-03-02 17:02:03 -05002294 hdev->ibh_mrs = NULL;
Peng Taod7e09d02013-05-02 16:46:55 +08002295}
2296
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002297void kiblnd_hdev_destroy(kib_hca_dev_t *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002298{
2299 kiblnd_hdev_cleanup_mrs(hdev);
2300
James Simmons06ace262016-02-12 12:06:08 -05002301 if (hdev->ibh_pd)
Peng Taod7e09d02013-05-02 16:46:55 +08002302 ib_dealloc_pd(hdev->ibh_pd);
2303
James Simmons06ace262016-02-12 12:06:08 -05002304 if (hdev->ibh_cmid)
Peng Taod7e09d02013-05-02 16:46:55 +08002305 rdma_destroy_id(hdev->ibh_cmid);
2306
2307 LIBCFS_FREE(hdev, sizeof(*hdev));
2308}
2309
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002310static int kiblnd_hdev_setup_mrs(kib_hca_dev_t *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002311{
2312 struct ib_mr *mr;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002313 int rc;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002314 int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
Peng Taod7e09d02013-05-02 16:46:55 +08002315
2316 rc = kiblnd_hdev_get_attr(hdev);
James Simmons5fd88332016-02-12 12:06:09 -05002317 if (rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002318 return rc;
2319
Luis de Bethencourt01738442015-10-21 18:40:40 +01002320 mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
2321 if (IS_ERR(mr)) {
2322 CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr));
2323 kiblnd_hdev_cleanup_mrs(hdev);
2324 return PTR_ERR(mr);
2325 }
Peng Taod7e09d02013-05-02 16:46:55 +08002326
Amir Shehata7cadcc72016-03-02 17:02:03 -05002327 hdev->ibh_mrs = mr;
Peng Taod7e09d02013-05-02 16:46:55 +08002328
Peng Taod7e09d02013-05-02 16:46:55 +08002329 return 0;
2330}
2331
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002332/* DUMMY */
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002333static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
2334 struct rdma_cm_event *event)
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002335{
Peng Taod7e09d02013-05-02 16:46:55 +08002336 return 0;
2337}
2338
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002339static int kiblnd_dev_need_failover(kib_dev_t *dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002340{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002341 struct rdma_cm_id *cmid;
2342 struct sockaddr_in srcaddr;
2343 struct sockaddr_in dstaddr;
2344 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08002345
James Simmons06ace262016-02-12 12:06:08 -05002346 if (!dev->ibd_hdev || /* initializing */
2347 !dev->ibd_hdev->ibh_cmid || /* listener is dead */
Peng Taod7e09d02013-05-02 16:46:55 +08002348 *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
2349 return 1;
2350
James Simmons4420cfd2016-02-12 12:06:00 -05002351 /*
2352 * XXX: it's UGLY, but there is no better way to detect
Peng Taod7e09d02013-05-02 16:46:55 +08002353 * ib-bonding HCA failover because:
2354 *
2355 * a. no reliable CM event for HCA failover...
2356 * b. no OFED API to get ib_device for current net_device...
2357 *
2358 * We have only two choices at this point:
2359 *
2360 * a. rdma_bind_addr(), it will conflict with listener cmid
James Simmons4420cfd2016-02-12 12:06:00 -05002361 * b. rdma_resolve_addr() to zero addr
2362 */
Peng Taod7e09d02013-05-02 16:46:55 +08002363 cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
2364 IB_QPT_RC);
2365 if (IS_ERR(cmid)) {
2366 rc = PTR_ERR(cmid);
2367 CERROR("Failed to create cmid for failover: %d\n", rc);
2368 return rc;
2369 }
2370
2371 memset(&srcaddr, 0, sizeof(srcaddr));
Mike Shueyec3d17c2015-05-19 10:14:36 -04002372 srcaddr.sin_family = AF_INET;
Peng Taod7e09d02013-05-02 16:46:55 +08002373 srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2374
2375 memset(&dstaddr, 0, sizeof(dstaddr));
2376 dstaddr.sin_family = AF_INET;
2377 rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
2378 (struct sockaddr *)&dstaddr, 1);
James Simmons5fd88332016-02-12 12:06:09 -05002379 if (rc || !cmid->device) {
Peng Tao5e8f6922013-07-15 22:27:09 +08002380 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2381 dev->ibd_ifname, &dev->ibd_ifip,
Peng Taod7e09d02013-05-02 16:46:55 +08002382 cmid->device, rc);
2383 rdma_destroy_id(cmid);
2384 return rc;
2385 }
2386
Liang Zhen199a0cc2015-09-14 18:41:33 -04002387 rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
2388 rdma_destroy_id(cmid);
Peng Taod7e09d02013-05-02 16:46:55 +08002389
Liang Zhen199a0cc2015-09-14 18:41:33 -04002390 return rc;
Peng Taod7e09d02013-05-02 16:46:55 +08002391}
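/*
 * The trick above: rdma_resolve_addr() against a zeroed destination binds
 * the throwaway cmid to whichever IB device currently backs ibd_ifip, so
 * comparing cmid->device with the cached ibh_ibdev reveals whether the
 * bonding driver has moved the IP to another HCA.
 */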
2392
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002393int kiblnd_dev_failover(kib_dev_t *dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002394{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002395 LIST_HEAD(zombie_tpo);
2396 LIST_HEAD(zombie_ppo);
2397 LIST_HEAD(zombie_fpo);
Mike Shueyec3d17c2015-05-19 10:14:36 -04002398 struct rdma_cm_id *cmid = NULL;
2399 kib_hca_dev_t *hdev = NULL;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002400 struct ib_pd *pd;
2401 kib_net_t *net;
2402 struct sockaddr_in addr;
2403 unsigned long flags;
2404 int rc = 0;
2405 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002406
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002407 LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
James Simmons06ace262016-02-12 12:06:08 -05002408 dev->ibd_can_failover || !dev->ibd_hdev);
Peng Taod7e09d02013-05-02 16:46:55 +08002409
2410 rc = kiblnd_dev_need_failover(dev);
2411 if (rc <= 0)
2412 goto out;
2413
James Simmons06ace262016-02-12 12:06:08 -05002414 if (dev->ibd_hdev &&
2415 dev->ibd_hdev->ibh_cmid) {
James Simmons4420cfd2016-02-12 12:06:00 -05002416 /*
2417 * XXX it's not good to close the old listener here,
Peng Taod7e09d02013-05-02 16:46:55 +08002418 * because we could then fail to create a new listener.
2419 * But we have to close it now, otherwise rdma_bind_addr
James Simmons4420cfd2016-02-12 12:06:00 -05002420 * will return EADDRINUSE... How annoying!
2421 */
Peng Taod7e09d02013-05-02 16:46:55 +08002422 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2423
2424 cmid = dev->ibd_hdev->ibh_cmid;
James Simmons4420cfd2016-02-12 12:06:00 -05002425 /*
2426 * make the next call to kiblnd_dev_need_failover()
2427 * return 1 for us
2428 */
Peng Taod7e09d02013-05-02 16:46:55 +08002429 dev->ibd_hdev->ibh_cmid = NULL;
2430 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2431
2432 rdma_destroy_id(cmid);
2433 }
2434
2435 cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
2436 IB_QPT_RC);
2437 if (IS_ERR(cmid)) {
2438 rc = PTR_ERR(cmid);
2439 CERROR("Failed to create cmid for failover: %d\n", rc);
2440 goto out;
2441 }
2442
2443 memset(&addr, 0, sizeof(addr));
2444 addr.sin_family = AF_INET;
2445 addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2446 addr.sin_port = htons(*kiblnd_tunables.kib_service);
2447
2448 /* Bind to failover device or port */
2449 rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
James Simmons5fd88332016-02-12 12:06:09 -05002450 if (rc || !cmid->device) {
Peng Tao5e8f6922013-07-15 22:27:09 +08002451 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2452 dev->ibd_ifname, &dev->ibd_ifip,
Peng Taod7e09d02013-05-02 16:46:55 +08002453 cmid->device, rc);
2454 rdma_destroy_id(cmid);
2455 goto out;
2456 }
2457
2458 LIBCFS_ALLOC(hdev, sizeof(*hdev));
James Simmons06ace262016-02-12 12:06:08 -05002459 if (!hdev) {
Peng Taod7e09d02013-05-02 16:46:55 +08002460 CERROR("Failed to allocate kib_hca_dev\n");
2461 rdma_destroy_id(cmid);
2462 rc = -ENOMEM;
2463 goto out;
2464 }
2465
2466 atomic_set(&hdev->ibh_ref, 1);
2467 hdev->ibh_dev = dev;
2468 hdev->ibh_cmid = cmid;
2469 hdev->ibh_ibdev = cmid->device;
2470
2471 pd = ib_alloc_pd(cmid->device);
2472 if (IS_ERR(pd)) {
2473 rc = PTR_ERR(pd);
2474 CERROR("Can't allocate PD: %d\n", rc);
2475 goto out;
2476 }
2477
2478 hdev->ibh_pd = pd;
2479
2480 rc = rdma_listen(cmid, 0);
James Simmons5fd88332016-02-12 12:06:09 -05002481 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002482 CERROR("Can't start new listener: %d\n", rc);
2483 goto out;
2484 }
2485
2486 rc = kiblnd_hdev_setup_mrs(hdev);
James Simmons5fd88332016-02-12 12:06:09 -05002487 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002488 CERROR("Can't setup device: %d\n", rc);
2489 goto out;
2490 }
2491
2492 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2493
Fabian Frederick6d37b172015-06-10 18:32:21 +02002494 swap(dev->ibd_hdev, hdev); /* take over the refcount */
Peng Taod7e09d02013-05-02 16:46:55 +08002495
2496 list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
2497 cfs_cpt_for_each(i, lnet_cpt_table()) {
2498 kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
2499 &zombie_tpo);
2500
Oleg Drokin415bcb52015-08-18 21:04:35 -04002501 if (net->ibn_fmr_ps)
Peng Taod7e09d02013-05-02 16:46:55 +08002502 kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
2503 &zombie_fpo);
Peng Taod7e09d02013-05-02 16:46:55 +08002504 }
2505 }
2506
2507 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2508 out:
2509 if (!list_empty(&zombie_tpo))
2510 kiblnd_destroy_pool_list(&zombie_tpo);
2511 if (!list_empty(&zombie_ppo))
2512 kiblnd_destroy_pool_list(&zombie_ppo);
2513 if (!list_empty(&zombie_fpo))
2514 kiblnd_destroy_fmr_pool_list(&zombie_fpo);
James Simmons06ace262016-02-12 12:06:08 -05002515 if (hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002516 kiblnd_hdev_decref(hdev);
2517
James Simmons5fd88332016-02-12 12:06:09 -05002518 if (rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002519 dev->ibd_failed_failover++;
2520 else
2521 dev->ibd_failed_failover = 0;
2522
2523 return rc;
2524}
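/*
 * Failover summary: destroy the old listener (else rdma_bind_addr() hits
 * EADDRINUSE), bind a fresh cmid, allocate a new PD and MR set, swap the
 * new hdev in under kib_global_lock, then fail every pool that still
 * references the old HCA so idle ones drain onto the zombie lists
 * destroyed at "out:".
 */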
2525
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002526void kiblnd_destroy_dev(kib_dev_t *dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002527{
James Simmons5fd88332016-02-12 12:06:09 -05002528 LASSERT(!dev->ibd_nnets);
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002529 LASSERT(list_empty(&dev->ibd_nets));
Peng Taod7e09d02013-05-02 16:46:55 +08002530
2531 list_del(&dev->ibd_fail_list);
2532 list_del(&dev->ibd_list);
2533
James Simmons06ace262016-02-12 12:06:08 -05002534 if (dev->ibd_hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002535 kiblnd_hdev_decref(dev->ibd_hdev);
2536
2537 LIBCFS_FREE(dev, sizeof(*dev));
2538}
2539
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002540static kib_dev_t *kiblnd_create_dev(char *ifname)
Peng Taod7e09d02013-05-02 16:46:55 +08002541{
2542 struct net_device *netdev;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002543 kib_dev_t *dev;
2544 __u32 netmask;
2545 __u32 ip;
2546 int up;
2547 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08002548
James Simmons1ad6a732015-06-08 22:27:10 -04002549 rc = lnet_ipif_query(ifname, &up, &ip, &netmask);
James Simmons5fd88332016-02-12 12:06:09 -05002550 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002551 CERROR("Can't query IPoIB interface %s: %d\n",
2552 ifname, rc);
2553 return NULL;
2554 }
2555
2556 if (!up) {
2557 CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
2558 return NULL;
2559 }
2560
2561 LIBCFS_ALLOC(dev, sizeof(*dev));
James Simmons06ace262016-02-12 12:06:08 -05002562 if (!dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002563 return NULL;
2564
Peng Taod7e09d02013-05-02 16:46:55 +08002565 netdev = dev_get_by_name(&init_net, ifname);
James Simmons06ace262016-02-12 12:06:08 -05002566 if (!netdev) {
Peng Taod7e09d02013-05-02 16:46:55 +08002567 dev->ibd_can_failover = 0;
2568 } else {
2569 dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
2570 dev_put(netdev);
2571 }
2572
2573 INIT_LIST_HEAD(&dev->ibd_nets);
2574 INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
2575 INIT_LIST_HEAD(&dev->ibd_fail_list);
2576 dev->ibd_ifip = ip;
2577 strcpy(&dev->ibd_ifname[0], ifname);
2578
2579 /* initialize the device */
2580 rc = kiblnd_dev_failover(dev);
James Simmons5fd88332016-02-12 12:06:09 -05002581 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002582 CERROR("Can't initialize device: %d\n", rc);
2583 LIBCFS_FREE(dev, sizeof(*dev));
2584 return NULL;
2585 }
2586
James Simmonsc314c312016-02-12 12:06:01 -05002587 list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
Peng Taod7e09d02013-05-02 16:46:55 +08002588 return dev;
2589}
2590
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002591static void kiblnd_base_shutdown(void)
Peng Taod7e09d02013-05-02 16:46:55 +08002592{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002593 struct kib_sched_info *sched;
2594 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002595
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002596 LASSERT(list_empty(&kiblnd_data.kib_devs));
Peng Taod7e09d02013-05-02 16:46:55 +08002597
Peng Taod7e09d02013-05-02 16:46:55 +08002598 switch (kiblnd_data.kib_init) {
2599 default:
2600 LBUG();
2601
2602 case IBLND_INIT_ALL:
2603 case IBLND_INIT_DATA:
James Simmons06ace262016-02-12 12:06:08 -05002604 LASSERT(kiblnd_data.kib_peers);
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002605 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002606 LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002607 LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
2608 LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
Liang Zhen4d99b252016-03-02 18:53:29 -05002609 LASSERT(list_empty(&kiblnd_data.kib_reconn_list));
2610 LASSERT(list_empty(&kiblnd_data.kib_reconn_wait));
Peng Taod7e09d02013-05-02 16:46:55 +08002611
2612 /* flag threads to terminate; wake and wait for them to die */
2613 kiblnd_data.kib_shutdown = 1;
2614
James Simmons4420cfd2016-02-12 12:06:00 -05002615 /*
2616 * NB: we really want to stop scheduler threads net by net
Peng Taod7e09d02013-05-02 16:46:55 +08002617 * instead of the whole module; this should be improved
James Simmons4420cfd2016-02-12 12:06:00 -05002618 * once LNet supports dynamic configuration
2619 */
Peng Taod7e09d02013-05-02 16:46:55 +08002620 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
2621 wake_up_all(&sched->ibs_waitq);
2622
2623 wake_up_all(&kiblnd_data.kib_connd_waitq);
2624 wake_up_all(&kiblnd_data.kib_failover_waitq);
2625
2626 i = 2;
James Simmons5fd88332016-02-12 12:06:09 -05002627 while (atomic_read(&kiblnd_data.kib_nthreads)) {
Peng Taod7e09d02013-05-02 16:46:55 +08002628 i++;
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002629 /* power of 2 ? */
2630 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
Peng Taod7e09d02013-05-02 16:46:55 +08002631 "Waiting for %d threads to terminate\n",
2632 atomic_read(&kiblnd_data.kib_nthreads));
Peng Taod3caf4d2014-03-18 21:05:56 +08002633 set_current_state(TASK_UNINTERRUPTIBLE);
2634 schedule_timeout(cfs_time_seconds(1));
Peng Taod7e09d02013-05-02 16:46:55 +08002635 }
2636
2637 /* fall through */
2638
2639 case IBLND_INIT_NOTHING:
2640 break;
2641 }
2642
James Simmons06ace262016-02-12 12:06:08 -05002643 if (kiblnd_data.kib_peers) {
Peng Taod7e09d02013-05-02 16:46:55 +08002644 LIBCFS_FREE(kiblnd_data.kib_peers,
2645 sizeof(struct list_head) *
2646 kiblnd_data.kib_peer_hash_size);
2647 }
2648
James Simmons06ace262016-02-12 12:06:08 -05002649 if (kiblnd_data.kib_scheds)
Peng Taod7e09d02013-05-02 16:46:55 +08002650 cfs_percpt_free(kiblnd_data.kib_scheds);
2651
Peng Taod7e09d02013-05-02 16:46:55 +08002652 kiblnd_data.kib_init = IBLND_INIT_NOTHING;
2653 module_put(THIS_MODULE);
2654}
2655
Frank Zago439b4d42016-03-02 17:02:00 -05002656static void kiblnd_shutdown(lnet_ni_t *ni)
Peng Taod7e09d02013-05-02 16:46:55 +08002657{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002658 kib_net_t *net = ni->ni_data;
2659 rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
2660 int i;
2661 unsigned long flags;
Peng Taod7e09d02013-05-02 16:46:55 +08002662
2663 LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
2664
James Simmons06ace262016-02-12 12:06:08 -05002665 if (!net)
Peng Taod7e09d02013-05-02 16:46:55 +08002666 goto out;
2667
Peng Taod7e09d02013-05-02 16:46:55 +08002668 write_lock_irqsave(g_lock, flags);
2669 net->ibn_shutdown = 1;
2670 write_unlock_irqrestore(g_lock, flags);
2671
2672 switch (net->ibn_init) {
2673 default:
2674 LBUG();
2675
2676 case IBLND_INIT_ALL:
2677 /* nuke all existing peers within this net */
2678 kiblnd_del_peer(ni, LNET_NID_ANY);
2679
2680 /* Wait for all peer state to clean up */
2681 i = 2;
James Simmons5fd88332016-02-12 12:06:09 -05002682 while (atomic_read(&net->ibn_npeers)) {
Peng Taod7e09d02013-05-02 16:46:55 +08002683 i++;
2684 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
2685 "%s: waiting for %d peers to disconnect\n",
2686 libcfs_nid2str(ni->ni_nid),
2687 atomic_read(&net->ibn_npeers));
Peng Taod3caf4d2014-03-18 21:05:56 +08002688 set_current_state(TASK_UNINTERRUPTIBLE);
2689 schedule_timeout(cfs_time_seconds(1));
Peng Taod7e09d02013-05-02 16:46:55 +08002690 }
2691
2692 kiblnd_net_fini_pools(net);
2693
2694 write_lock_irqsave(g_lock, flags);
2695 LASSERT(net->ibn_dev->ibd_nnets > 0);
2696 net->ibn_dev->ibd_nnets--;
2697 list_del(&net->ibn_list);
2698 write_unlock_irqrestore(g_lock, flags);
2699
2700 /* fall through */
2701
2702 case IBLND_INIT_NOTHING:
James Simmons5fd88332016-02-12 12:06:09 -05002703 LASSERT(!atomic_read(&net->ibn_nconns));
Peng Taod7e09d02013-05-02 16:46:55 +08002704
James Simmons5fd88332016-02-12 12:06:09 -05002705 if (net->ibn_dev && !net->ibn_dev->ibd_nnets)
Peng Taod7e09d02013-05-02 16:46:55 +08002706 kiblnd_destroy_dev(net->ibn_dev);
2707
2708 break;
2709 }
2710
Peng Taod7e09d02013-05-02 16:46:55 +08002711 net->ibn_init = IBLND_INIT_NOTHING;
2712 ni->ni_data = NULL;
2713
2714 LIBCFS_FREE(net, sizeof(*net));
2715
2716out:
2717 if (list_empty(&kiblnd_data.kib_devs))
2718 kiblnd_base_shutdown();
Peng Taod7e09d02013-05-02 16:46:55 +08002719}
2720
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002721static int kiblnd_base_startup(void)
Peng Taod7e09d02013-05-02 16:46:55 +08002722{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002723 struct kib_sched_info *sched;
2724 int rc;
2725 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002726
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002727 LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
Peng Taod7e09d02013-05-02 16:46:55 +08002728
2729 try_module_get(THIS_MODULE);
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002730 /* zero pointers, flags etc */
2731 memset(&kiblnd_data, 0, sizeof(kiblnd_data));
Peng Taod7e09d02013-05-02 16:46:55 +08002732
2733 rwlock_init(&kiblnd_data.kib_global_lock);
2734
2735 INIT_LIST_HEAD(&kiblnd_data.kib_devs);
2736 INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
2737
2738 kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
2739 LIBCFS_ALLOC(kiblnd_data.kib_peers,
Mike Shueyec3d17c2015-05-19 10:14:36 -04002740 sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size);
James Simmons06ace262016-02-12 12:06:08 -05002741 if (!kiblnd_data.kib_peers)
Peng Taod7e09d02013-05-02 16:46:55 +08002742 goto failed;
Peng Taod7e09d02013-05-02 16:46:55 +08002743 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
2744 INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
2745
2746 spin_lock_init(&kiblnd_data.kib_connd_lock);
2747 INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
2748 INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
Liang Zhen4d99b252016-03-02 18:53:29 -05002749 INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
2750 INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
2751
Peng Taod7e09d02013-05-02 16:46:55 +08002752 init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
2753 init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
2754
2755 kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
2756 sizeof(*sched));
James Simmons06ace262016-02-12 12:06:08 -05002757 if (!kiblnd_data.kib_scheds)
Peng Taod7e09d02013-05-02 16:46:55 +08002758 goto failed;
2759
2760 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
Mike Shueyec3d17c2015-05-19 10:14:36 -04002761 int nthrs;
Peng Taod7e09d02013-05-02 16:46:55 +08002762
2763 spin_lock_init(&sched->ibs_lock);
2764 INIT_LIST_HEAD(&sched->ibs_conns);
2765 init_waitqueue_head(&sched->ibs_waitq);
2766
2767 nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
2768 if (*kiblnd_tunables.kib_nscheds > 0) {
2769 nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
2770 } else {
James Simmons4420cfd2016-02-12 12:06:00 -05002771 /*
2772 * max to half of CPUs, another half is reserved for
2773 * upper layer modules
2774 */
Peng Taod7e09d02013-05-02 16:46:55 +08002775 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2776 }
2777
2778 sched->ibs_nthreads_max = nthrs;
2779 sched->ibs_cpt = i;
2780 }
2781
2782 kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
2783
2784 /* lists/ptrs/locks initialised */
2785 kiblnd_data.kib_init = IBLND_INIT_DATA;
2786 /*****************************************************/
2787
2788 rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
James Simmons5fd88332016-02-12 12:06:09 -05002789 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002790 CERROR("Can't spawn o2iblnd connd: %d\n", rc);
2791 goto failed;
2792 }
2793
James Simmons5fd88332016-02-12 12:06:09 -05002794 if (*kiblnd_tunables.kib_dev_failover)
Peng Taod7e09d02013-05-02 16:46:55 +08002795 rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
2796 "kiblnd_failover");
2797
James Simmons5fd88332016-02-12 12:06:09 -05002798 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002799 CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
2800 goto failed;
2801 }
2802
2803 /* flag everything initialised */
2804 kiblnd_data.kib_init = IBLND_INIT_ALL;
2805 /*****************************************************/
2806
2807 return 0;
2808
2809 failed:
2810 kiblnd_base_shutdown();
2811 return -ENETDOWN;
2812}
2813
static int kiblnd_start_schedulers(struct kib_sched_info *sched)
{
	int rc = 0;
	int nthrs;
	int i;

	if (!sched->ibs_nthreads) {
		if (*kiblnd_tunables.kib_nscheds > 0) {
			nthrs = sched->ibs_nthreads_max;
		} else {
			nthrs = cfs_cpt_weight(lnet_cpt_table(),
					       sched->ibs_cpt);
			nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
			nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
		}
	} else {
		LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
		/*
		 * add at most one thread for a new interface; the
		 * comparison below evaluates to 0 or 1
		 */
		nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
	}

	for (i = 0; i < nthrs; i++) {
		long id;
		char name[20];

		/* name encodes CPT and thread index, e.g. "kiblnd_sd_00_01" */
		id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
		snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
			 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
		rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
		if (!rc)
			continue;

		CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
		       sched->ibs_cpt, sched->ibs_nthreads + i, rc);
		break;
	}

	sched->ibs_nthreads += i;
	return rc;
}

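/*
 * kiblnd_dev_start_threads - make sure every CPT used by this NI has
 * scheduler threads.  For a pre-existing device (!newdev), CPTs that
 * already have threads running are skipped.  A NULL 'cpts' array means
 * CPTs 0..ncpts-1.
 */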
static int kiblnd_dev_start_threads(kib_dev_t *dev, int newdev, __u32 *cpts,
				    int ncpts)
{
	int cpt;
	int rc;
	int i;

	for (i = 0; i < ncpts; i++) {
		struct kib_sched_info *sched;

		cpt = !cpts ? i : cpts[i];
		sched = kiblnd_data.kib_scheds[cpt];

		if (!newdev && sched->ibs_nthreads > 0)
			continue;

		rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
		if (rc) {
			CERROR("Failed to start scheduler threads for %s\n",
			       dev->ibd_ifname);
			return rc;
		}
	}
	return 0;
}

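/*
 * kiblnd_dev_search - look up an existing kib_dev by IPoIB interface
 * name.  An exact match wins; otherwise the first device whose name
 * matches once any ":alias" suffix is stripped from both sides is
 * returned.  The colons are overwritten with NULs for the comparison
 * and restored before moving on.
 */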
static kib_dev_t *kiblnd_dev_search(char *ifname)
{
	kib_dev_t *alias = NULL;
	kib_dev_t *dev;
	char *colon;
	char *colon2;

	colon = strchr(ifname, ':');
	list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
		if (!strcmp(&dev->ibd_ifname[0], ifname))
			return dev;

		if (alias)
			continue;

		colon2 = strchr(dev->ibd_ifname, ':');
		if (colon)
			*colon = 0;
		if (colon2)
			*colon2 = 0;

		if (!strcmp(&dev->ibd_ifname[0], ifname))
			alias = dev;

		if (colon)
			*colon = ':';
		if (colon2)
			*colon2 = ':';
	}
	return alias;
}

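/*
 * kiblnd_startup - bring up one o2iblnd network interface.
 *
 * The first NI also initialises the module-wide state through
 * kiblnd_base_startup().  The interface name comes from the LNet
 * 'networks=' option (typically something like networks="o2ib(ib0)")
 * or falls back to the default IPoIB interface tunable; the NID is
 * then derived from that interface's IP address.
 */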
static int kiblnd_startup(lnet_ni_t *ni)
{
	char *ifname;
	kib_dev_t *ibdev = NULL;
	kib_net_t *net;
	struct timespec64 tv;
	unsigned long flags;
	int rc;
	int newdev;

	LASSERT(ni->ni_lnd == &the_o2iblnd);

	if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
		rc = kiblnd_base_startup();
		if (rc)
			return rc;
	}

	LIBCFS_ALLOC(net, sizeof(*net));
	ni->ni_data = net;
	if (!net)
		goto net_failed;

	/* stamp the net with its startup time in microseconds */
	ktime_get_real_ts64(&tv);
	net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
			       tv.tv_nsec / NSEC_PER_USEC;

	rc = kiblnd_tunables_setup(ni);
	if (rc)
		goto net_failed;

	if (ni->ni_interfaces[0]) {
		/* Use the IPoIB interface specified in 'networks=' */

		CLASSERT(LNET_MAX_INTERFACES > 1);
		if (ni->ni_interfaces[1]) {
			CERROR("Multiple interfaces not supported\n");
			goto failed;
		}

		ifname = ni->ni_interfaces[0];
	} else {
		ifname = *kiblnd_tunables.kib_default_ipif;
	}

	if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
		CERROR("IPoIB interface name too long: %s\n", ifname);
		goto failed;
	}

	ibdev = kiblnd_dev_search(ifname);

	newdev = !ibdev;
	/* create a fresh kib_dev even when only an alias matched */
	if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname))
		ibdev = kiblnd_create_dev(ifname);

	if (!ibdev)
		goto failed;

	net->ibn_dev = ibdev;
	ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);

	rc = kiblnd_dev_start_threads(ibdev, newdev,
				      ni->ni_cpts, ni->ni_ncpts);
	if (rc)
		goto failed;

	rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
	if (rc) {
		CERROR("Failed to initialize NI pools: %d\n", rc);
		goto failed;
	}

	write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
	ibdev->ibd_nnets++;
	list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
	write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

	net->ibn_init = IBLND_INIT_ALL;

	return 0;

failed:
	if (!net->ibn_dev && ibdev)
		kiblnd_destroy_dev(ibdev);

net_failed:
	kiblnd_shutdown(ni);

	CDEBUG(D_NET, "kiblnd_startup failed\n");
	return -ENETDOWN;
}

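/* method table handed to LNet when this LND registers itself */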
static lnd_t the_o2iblnd = {
	.lnd_type = O2IBLND,
	.lnd_startup = kiblnd_startup,
	.lnd_shutdown = kiblnd_shutdown,
	.lnd_ctl = kiblnd_ctl,
	.lnd_query = kiblnd_query,
	.lnd_send = kiblnd_send,
	.lnd_recv = kiblnd_recv,
};

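/* module unload entry: remove this LND from LNet's table */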
static void __exit ko2iblnd_exit(void)
{
	lnet_unregister_lnd(&the_o2iblnd);
}

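/*
 * Module load entry: verify at compile time that every wire message,
 * including the largest RDMA descriptors, fits in IBLND_MSG_SIZE, then
 * set up the tunables and register the LND with LNet.
 */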
static int __init ko2iblnd_init(void)
{
	CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
	CLASSERT(offsetof(kib_msg_t,
			  ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
		 <= IBLND_MSG_SIZE);
	CLASSERT(offsetof(kib_msg_t,
			  ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
		 <= IBLND_MSG_SIZE);

	kiblnd_tunables_init();

	lnet_register_lnd(&the_o2iblnd);

	return 0;
}

MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver");
MODULE_VERSION("2.7.0");
MODULE_LICENSE("GPL");

module_init(ko2iblnd_init);
module_exit(ko2iblnd_exit);