/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/o2iblnd/o2iblnd.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */

#include <asm/div64.h>
#include <asm/page.h>
#include "o2iblnd.h"

static lnd_t the_o2iblnd;

struct kib_data kiblnd_data;

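/*
 * Rotate-and-add checksum over @nob bytes at @ptr, used to protect o2iblnd
 * wire messages when the "cksum" tunable is enabled.
 */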
static __u32 kiblnd_cksum(void *ptr, int nob)
{
        char *c = ptr;
        __u32 sum = 0;

        while (nob-- > 0)
                sum = ((sum << 1) | (sum >> 31)) + *c++;

        /* ensure I don't return 0 (== no checksum) */
        return !sum ? 1 : sum;
}

static char *kiblnd_msgtype2str(int type)
{
        switch (type) {
        case IBLND_MSG_CONNREQ:
                return "CONNREQ";

        case IBLND_MSG_CONNACK:
                return "CONNACK";

        case IBLND_MSG_NOOP:
                return "NOOP";

        case IBLND_MSG_IMMEDIATE:
                return "IMMEDIATE";

        case IBLND_MSG_PUT_REQ:
                return "PUT_REQ";

        case IBLND_MSG_PUT_NAK:
                return "PUT_NAK";

        case IBLND_MSG_PUT_ACK:
                return "PUT_ACK";

        case IBLND_MSG_PUT_DONE:
                return "PUT_DONE";

        case IBLND_MSG_GET_REQ:
                return "GET_REQ";

        case IBLND_MSG_GET_DONE:
                return "GET_DONE";

        default:
                return "???";
        }
}

static int kiblnd_msgtype2size(int type)
{
        const int hdr_size = offsetof(struct kib_msg, ibm_u);

        switch (type) {
        case IBLND_MSG_CONNREQ:
        case IBLND_MSG_CONNACK:
                return hdr_size + sizeof(struct kib_connparams);

        case IBLND_MSG_NOOP:
                return hdr_size;

        case IBLND_MSG_IMMEDIATE:
                return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]);

        case IBLND_MSG_PUT_REQ:
                return hdr_size + sizeof(struct kib_putreq_msg);

        case IBLND_MSG_PUT_ACK:
                return hdr_size + sizeof(struct kib_putack_msg);

        case IBLND_MSG_GET_REQ:
                return hdr_size + sizeof(struct kib_get_msg);

        case IBLND_MSG_PUT_NAK:
        case IBLND_MSG_PUT_DONE:
        case IBLND_MSG_GET_DONE:
                return hdr_size + sizeof(struct kib_completion_msg);
        default:
                return -1;
        }
}

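/*
 * Validate (and, if @flip is set, byte-swap) the RDMA descriptor carried by
 * a GET_REQ or PUT_ACK message. Returns 0 on success, 1 if the descriptor is
 * truncated or describes an over-sized payload.
 */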
static int kiblnd_unpack_rd(struct kib_msg *msg, int flip)
{
        struct kib_rdma_desc *rd;
        int msg_size;
        int nob;
        int n;
        int i;

        LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ ||
                msg->ibm_type == IBLND_MSG_PUT_ACK);

        rd = msg->ibm_type == IBLND_MSG_GET_REQ ?
             &msg->ibm_u.get.ibgm_rd :
             &msg->ibm_u.putack.ibpam_rd;

        if (flip) {
                __swab32s(&rd->rd_key);
                __swab32s(&rd->rd_nfrags);
        }

        n = rd->rd_nfrags;

        nob = offsetof(struct kib_msg, ibm_u) +
              kiblnd_rd_msg_size(rd, msg->ibm_type, n);

        if (msg->ibm_nob < nob) {
                CERROR("Short %s: %d(%d)\n",
                       kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob);
                return 1;
        }

        msg_size = kiblnd_rd_size(rd);
        if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) {
                CERROR("Bad msg_size: %d, should be 0 < n <= %d\n",
                       msg_size, LNET_MAX_PAYLOAD);
                return 1;
        }

        if (!flip)
                return 0;

        for (i = 0; i < n; i++) {
                __swab32s(&rd->rd_frags[i].rf_nob);
                __swab64s(&rd->rd_frags[i].rf_addr);
        }

        return 0;
}

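/*
 * Fill in the common header fields of an outgoing message (magic, version,
 * credits, source/destination NIDs and stamps) and, if the "cksum" tunable is
 * set, checksum the whole message with ibm_cksum held at zero.
 */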
void kiblnd_pack_msg(lnet_ni_t *ni, struct kib_msg *msg, int version,
                     int credits, lnet_nid_t dstnid, __u64 dststamp)
{
        struct kib_net *net = ni->ni_data;

        /*
         * CAVEAT EMPTOR! all message fields not set here should have been
         * initialised previously.
         */
        msg->ibm_magic = IBLND_MSG_MAGIC;
        msg->ibm_version = version;
        /* ibm_type */
        msg->ibm_credits = credits;
        /* ibm_nob */
        msg->ibm_cksum = 0;
        msg->ibm_srcnid = ni->ni_nid;
        msg->ibm_srcstamp = net->ibn_incarnation;
        msg->ibm_dstnid = dstnid;
        msg->ibm_dststamp = dststamp;

        if (*kiblnd_tunables.kib_cksum) {
                /* NB ibm_cksum zero while computing cksum */
                msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
        }
}

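/*
 * Validate an incoming message of @nob bytes: detect peer endianness from the
 * magic, check version, length and checksum, then byte-swap the header and
 * any type-specific payload fields in place. Returns 0 or -EPROTO.
 */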
int kiblnd_unpack_msg(struct kib_msg *msg, int nob)
{
        const int hdr_size = offsetof(struct kib_msg, ibm_u);
        __u32 msg_cksum;
        __u16 version;
        int msg_nob;
        int flip;

        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        if (msg->ibm_magic == IBLND_MSG_MAGIC) {
                flip = 0;
        } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
                flip = 1;
        } else {
                CERROR("Bad magic: %08x\n", msg->ibm_magic);
                return -EPROTO;
        }

        version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
        if (version != IBLND_MSG_VERSION &&
            version != IBLND_MSG_VERSION_1) {
                CERROR("Bad version: %x\n", version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CERROR("Short message: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
        if (msg_nob > nob) {
                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /*
         * checksum must be computed with ibm_cksum zero and BEFORE anything
         * gets flipped
         */
        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
        msg->ibm_cksum = 0;
        if (msg_cksum &&
            msg_cksum != kiblnd_cksum(msg, msg_nob)) {
                CERROR("Bad checksum\n");
                return -EPROTO;
        }

        msg->ibm_cksum = msg_cksum;

        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                msg->ibm_version = version;
                CLASSERT(sizeof(msg->ibm_type) == 1);
                CLASSERT(sizeof(msg->ibm_credits) == 1);
                msg->ibm_nob = msg_nob;
                __swab64s(&msg->ibm_srcnid);
                __swab64s(&msg->ibm_srcstamp);
                __swab64s(&msg->ibm_dstnid);
                __swab64s(&msg->ibm_dststamp);
        }

        if (msg->ibm_srcnid == LNET_NID_ANY) {
                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
                return -EPROTO;
        }

        if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) {
                CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type),
                       msg_nob, kiblnd_msgtype2size(msg->ibm_type));
                return -EPROTO;
        }

        switch (msg->ibm_type) {
        default:
                CERROR("Unknown message type %x\n", msg->ibm_type);
                return -EPROTO;

        case IBLND_MSG_NOOP:
        case IBLND_MSG_IMMEDIATE:
        case IBLND_MSG_PUT_REQ:
                break;

        case IBLND_MSG_PUT_ACK:
        case IBLND_MSG_GET_REQ:
                if (kiblnd_unpack_rd(msg, flip))
                        return -EPROTO;
                break;

        case IBLND_MSG_PUT_NAK:
        case IBLND_MSG_PUT_DONE:
        case IBLND_MSG_GET_DONE:
                if (flip)
                        __swab32s(&msg->ibm_u.completion.ibcm_status);
                break;

        case IBLND_MSG_CONNREQ:
        case IBLND_MSG_CONNACK:
                if (flip) {
                        __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
                        __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
                }
                break;
        }
        return 0;
}

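/*
 * Allocate and initialise a peer for @nid on the CPT derived from the NID.
 * The new peer starts with one reference owned by the caller and is not yet
 * hashed into the peer table.
 */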
int kiblnd_create_peer(lnet_ni_t *ni, struct kib_peer **peerp, lnet_nid_t nid)
{
        struct kib_peer *peer;
        struct kib_net *net = ni->ni_data;
        int cpt = lnet_cpt_of_nid(nid);
        unsigned long flags;

        LASSERT(net);
        LASSERT(nid != LNET_NID_ANY);

        LIBCFS_CPT_ALLOC(peer, lnet_cpt_table(), cpt, sizeof(*peer));
        if (!peer) {
                CERROR("Cannot allocate peer\n");
                return -ENOMEM;
        }

        peer->ibp_ni = ni;
        peer->ibp_nid = nid;
        peer->ibp_error = 0;
        peer->ibp_last_alive = 0;
        peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
        peer->ibp_queue_depth = ni->ni_peertxcredits;
        atomic_set(&peer->ibp_refcount, 1);  /* 1 ref for caller */

        INIT_LIST_HEAD(&peer->ibp_list);     /* not in the peer table yet */
        INIT_LIST_HEAD(&peer->ibp_conns);
        INIT_LIST_HEAD(&peer->ibp_tx_queue);

        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        /* always called with a ref on ni, which prevents ni being shutdown */
        LASSERT(!net->ibn_shutdown);

        /* npeers only grows with the global lock held */
        atomic_inc(&net->ibn_npeers);

        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        *peerp = peer;
        return 0;
}

void kiblnd_destroy_peer(struct kib_peer *peer)
{
        struct kib_net *net = peer->ibp_ni->ni_data;

        LASSERT(net);
        LASSERT(!atomic_read(&peer->ibp_refcount));
        LASSERT(!kiblnd_peer_active(peer));
        LASSERT(kiblnd_peer_idle(peer));
        LASSERT(list_empty(&peer->ibp_tx_queue));

        LIBCFS_FREE(peer, sizeof(*peer));

        /*
         * NB a peer's connections keep a reference on their peer until
         * they are destroyed, so we can be assured that _all_ state to do
         * with this peer has been cleaned up when its refcount drops to
         * zero.
         */
        atomic_dec(&net->ibn_npeers);
}

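/*
 * Look up @nid in the peer hash table; called with kib_global_lock held.
 */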
struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid)
{
        /*
         * the caller is responsible for accounting the additional reference
         * that this creates
         */
        struct list_head *peer_list = kiblnd_nid2peerlist(nid);
        struct list_head *tmp;
        struct kib_peer *peer;

        list_for_each(tmp, peer_list) {
                peer = list_entry(tmp, struct kib_peer, ibp_list);
                LASSERT(!kiblnd_peer_idle(peer));

                if (peer->ibp_nid != nid)
                        continue;

                CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n",
                       peer, libcfs_nid2str(nid),
                       atomic_read(&peer->ibp_refcount),
                       peer->ibp_version);
                return peer;
        }
        return NULL;
}

void kiblnd_unlink_peer_locked(struct kib_peer *peer)
{
        LASSERT(list_empty(&peer->ibp_conns));

        LASSERT(kiblnd_peer_active(peer));
        list_del_init(&peer->ibp_list);
        /* lose peerlist's ref */
        kiblnd_peer_decref(peer);
}

static int kiblnd_get_peer_info(lnet_ni_t *ni, int index,
                                lnet_nid_t *nidp, int *count)
{
        struct kib_peer *peer;
        struct list_head *ptmp;
        int i;
        unsigned long flags;

        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, struct kib_peer, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (index-- > 0)
                                continue;

                        *nidp = peer->ibp_nid;
                        *count = atomic_read(&peer->ibp_refcount);

                        read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
                                               flags);
                        return 0;
                }
        }

        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        return -ENOENT;
}

static void kiblnd_del_peer_locked(struct kib_peer *peer)
{
        struct list_head *ctmp;
        struct list_head *cnxt;
        struct kib_conn *conn;

        if (list_empty(&peer->ibp_conns)) {
                kiblnd_unlink_peer_locked(peer);
        } else {
                list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
                        conn = list_entry(ctmp, struct kib_conn, ibc_list);

                        kiblnd_close_conn_locked(conn, 0);
                }
                /* NB closing peer's last conn unlinked it. */
        }
        /*
         * NB peer now unlinked; might even be freed if the peer table had the
         * last ref on it.
         */
}

static int kiblnd_del_peer(lnet_ni_t *ni, lnet_nid_t nid)
{
        LIST_HEAD(zombies);
        struct list_head *ptmp;
        struct list_head *pnxt;
        struct kib_peer *peer;
        int lo;
        int hi;
        int i;
        unsigned long flags;
        int rc = -ENOENT;

        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY) {
                lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
                hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
        } else {
                lo = 0;
                hi = kiblnd_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, struct kib_peer, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                continue;

                        if (!list_empty(&peer->ibp_tx_queue)) {
                                LASSERT(list_empty(&peer->ibp_conns));

                                list_splice_init(&peer->ibp_tx_queue,
                                                 &zombies);
                        }

                        kiblnd_del_peer_locked(peer);
                        rc = 0; /* matched something */
                }
        }

        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        kiblnd_txlist_done(ni, &zombies, -EIO);

        return rc;
}

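/*
 * Return the index'th connection on @ni, walking the peer table in hash
 * order, with a reference added for the caller (drop it with
 * kiblnd_conn_decref()), or NULL if there are fewer than index + 1
 * connections.
 */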
static struct kib_conn *kiblnd_get_conn_by_idx(lnet_ni_t *ni, int index)
{
        struct kib_peer *peer;
        struct list_head *ptmp;
        struct kib_conn *conn;
        struct list_head *ctmp;
        int i;
        unsigned long flags;

        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
                list_for_each(ptmp, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, struct kib_peer, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        list_for_each(ctmp, &peer->ibp_conns) {
                                if (index-- > 0)
                                        continue;

                                conn = list_entry(ctmp, struct kib_conn,
                                                  ibc_list);
                                kiblnd_conn_addref(conn);
                                read_unlock_irqrestore(
                                        &kiblnd_data.kib_global_lock,
                                        flags);
                                return conn;
                        }
                }
        }

        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
        return NULL;
}

int kiblnd_translate_mtu(int value)
{
        switch (value) {
        default:
                return -1;
        case 0:
                return 0;
        case 256:
                return IB_MTU_256;
        case 512:
                return IB_MTU_512;
        case 1024:
                return IB_MTU_1024;
        case 2048:
                return IB_MTU_2048;
        case 4096:
                return IB_MTU_4096;
        }
}

static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid)
{
        int mtu;

        /* XXX There is no path record for iWARP, set by netdev->change_mtu? */
        if (!cmid->route.path_rec)
                return;

        mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu);
        LASSERT(mtu >= 0);
        if (mtu)
                cmid->route.path_rec->mtu = mtu;
}

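/*
 * Pick a completion vector for a new connection by hashing the peer NID onto
 * the CPUs of the connection's CPT, so CQ interrupts spread across the
 * partition's cores.
 */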
static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt)
{
        cpumask_t *mask;
        int vectors;
        int off;
        int i;
        lnet_nid_t nid = conn->ibc_peer->ibp_nid;

        vectors = conn->ibc_cmid->device->num_comp_vectors;
        if (vectors <= 1)
                return 0;

        mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt);
        if (!mask)
                return 0;

        /* hash NID to CPU id in this partition... */
        off = do_div(nid, cpumask_weight(mask));
        for_each_cpu(i, mask) {
                if (!off--)
                        return i % vectors;
        }

        LBUG();
        return 1;
}

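/*
 * Build a fully initialised connection for @peer on @cmid: allocate the
 * connection and RX buffers, create the CQ and QP, and post the initial
 * receives. See the CAVEAT EMPTOR comment below for reference ownership.
 */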
struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid,
                                    int state, int version)
{
        /*
         * CAVEAT EMPTOR:
         * If the new conn is created successfully it takes over the caller's
         * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself
         * is destroyed. On failure, the caller's ref on 'peer' remains and
         * she must dispose of 'cmid'. (Actually I'd block forever if I tried
         * to destroy 'cmid' here since I'm called from the CM which still has
         * its ref on 'cmid').
         */
        rwlock_t *glock = &kiblnd_data.kib_global_lock;
        struct kib_net *net = peer->ibp_ni->ni_data;
        struct kib_dev *dev;
        struct ib_qp_init_attr *init_qp_attr;
        struct kib_sched_info *sched;
        struct ib_cq_init_attr cq_attr = {};
        struct kib_conn *conn;
        struct ib_cq *cq;
        unsigned long flags;
        int cpt;
        int rc;
        int i;

        LASSERT(net);
        LASSERT(!in_interrupt());

        dev = net->ibn_dev;

        cpt = lnet_cpt_of_nid(peer->ibp_nid);
        sched = kiblnd_data.kib_scheds[cpt];

        LASSERT(sched->ibs_nthreads > 0);

        LIBCFS_CPT_ALLOC(init_qp_attr, lnet_cpt_table(), cpt,
                         sizeof(*init_qp_attr));
        if (!init_qp_attr) {
                CERROR("Can't allocate qp_attr for %s\n",
                       libcfs_nid2str(peer->ibp_nid));
                goto failed_0;
        }

        LIBCFS_CPT_ALLOC(conn, lnet_cpt_table(), cpt, sizeof(*conn));
        if (!conn) {
                CERROR("Can't allocate connection for %s\n",
                       libcfs_nid2str(peer->ibp_nid));
                goto failed_1;
        }

        conn->ibc_state = IBLND_CONN_INIT;
        conn->ibc_version = version;
        conn->ibc_peer = peer;                  /* I take the caller's ref */
        cmid->context = conn;                   /* for future CM callbacks */
        conn->ibc_cmid = cmid;
        conn->ibc_max_frags = peer->ibp_max_frags;
        conn->ibc_queue_depth = peer->ibp_queue_depth;

        INIT_LIST_HEAD(&conn->ibc_early_rxs);
        INIT_LIST_HEAD(&conn->ibc_tx_noops);
        INIT_LIST_HEAD(&conn->ibc_tx_queue);
        INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
        INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
        INIT_LIST_HEAD(&conn->ibc_active_txs);
        spin_lock_init(&conn->ibc_lock);

        LIBCFS_CPT_ALLOC(conn->ibc_connvars, lnet_cpt_table(), cpt,
                         sizeof(*conn->ibc_connvars));
        if (!conn->ibc_connvars) {
                CERROR("Can't allocate in-progress connection state\n");
                goto failed_2;
        }

        write_lock_irqsave(glock, flags);
        if (dev->ibd_failover) {
                write_unlock_irqrestore(glock, flags);
                CERROR("%s: failover in progress\n", dev->ibd_ifname);
                goto failed_2;
        }

        if (dev->ibd_hdev->ibh_ibdev != cmid->device) {
                /* wakeup failover thread and teardown connection */
                if (kiblnd_dev_can_failover(dev)) {
                        list_add_tail(&dev->ibd_fail_list,
                                      &kiblnd_data.kib_failed_devs);
                        wake_up(&kiblnd_data.kib_failover_waitq);
                }

                write_unlock_irqrestore(glock, flags);
                CERROR("cmid HCA(%s), kib_dev(%s) need failover\n",
                       cmid->device->name, dev->ibd_ifname);
                goto failed_2;
        }

        kiblnd_hdev_addref_locked(dev->ibd_hdev);
        conn->ibc_hdev = dev->ibd_hdev;

        kiblnd_setup_mtu_locked(cmid);

        write_unlock_irqrestore(glock, flags);

        LIBCFS_CPT_ALLOC(conn->ibc_rxs, lnet_cpt_table(), cpt,
                         IBLND_RX_MSGS(conn) * sizeof(struct kib_rx));
        if (!conn->ibc_rxs) {
                CERROR("Cannot allocate RX buffers\n");
                goto failed_2;
        }

        rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt,
                                IBLND_RX_MSG_PAGES(conn));
        if (rc)
                goto failed_2;

        kiblnd_map_rx_descs(conn);

        cq_attr.cqe = IBLND_CQ_ENTRIES(conn);
        cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt);
        cq = ib_create_cq(cmid->device,
                          kiblnd_cq_completion, kiblnd_cq_event, conn,
                          &cq_attr);
        if (IS_ERR(cq)) {
                CERROR("Failed to create CQ with %d CQEs: %ld\n",
                       IBLND_CQ_ENTRIES(conn), PTR_ERR(cq));
                goto failed_2;
        }

        conn->ibc_cq = cq;

        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
        if (rc) {
                CERROR("Can't request completion notification: %d\n", rc);
                goto failed_2;
        }

        init_qp_attr->event_handler = kiblnd_qp_event;
        init_qp_attr->qp_context = conn;
        init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn);
        init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn);
        init_qp_attr->cap.max_send_sge = 1;
        init_qp_attr->cap.max_recv_sge = 1;
        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
        init_qp_attr->qp_type = IB_QPT_RC;
        init_qp_attr->send_cq = cq;
        init_qp_attr->recv_cq = cq;

        conn->ibc_sched = sched;

        rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr);
        if (rc) {
                CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n",
                       rc, init_qp_attr->cap.max_send_wr,
                       init_qp_attr->cap.max_recv_wr);
                goto failed_2;
        }

        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));

        /* 1 ref for caller and each rxmsg */
        atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn));
        conn->ibc_nrx = IBLND_RX_MSGS(conn);

        /* post receives */
        for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
                rc = kiblnd_post_rx(&conn->ibc_rxs[i],
                                    IBLND_POSTRX_NO_CREDIT);
                if (rc) {
                        CERROR("Can't post rxmsg: %d\n", rc);

                        /* Make posted receives complete */
                        kiblnd_abort_receives(conn);

                        /*
                         * correct # of posted buffers
                         * NB locking needed now I'm racing with completion
                         */
                        spin_lock_irqsave(&sched->ibs_lock, flags);
                        conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i;
                        spin_unlock_irqrestore(&sched->ibs_lock, flags);

                        /*
                         * cmid will be destroyed by CM(ofed) after cm_callback
                         * returned, so we can't refer it anymore
                         * (by kiblnd_connd()->kiblnd_destroy_conn)
                         */
                        rdma_destroy_qp(conn->ibc_cmid);
                        conn->ibc_cmid = NULL;

                        /* Drop my own and unused rxbuffer refcounts */
                        while (i++ <= IBLND_RX_MSGS(conn))
                                kiblnd_conn_decref(conn);

                        return NULL;
                }
        }

        /* Init successful! */
        LASSERT(state == IBLND_CONN_ACTIVE_CONNECT ||
                state == IBLND_CONN_PASSIVE_WAIT);
        conn->ibc_state = state;

        /* 1 more conn */
        atomic_inc(&net->ibn_nconns);
        return conn;

 failed_2:
        kiblnd_destroy_conn(conn, true);
 failed_1:
        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
 failed_0:
        return NULL;
}

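/*
 * Tear down a connection whose refcount has dropped to zero: destroy the QP
 * and CQ, unmap and free the RX buffers, and release the HCA and peer
 * references taken in kiblnd_create_conn().
 */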
void kiblnd_destroy_conn(struct kib_conn *conn, bool free_conn)
{
        struct rdma_cm_id *cmid = conn->ibc_cmid;
        struct kib_peer *peer = conn->ibc_peer;
        int rc;

        LASSERT(!in_interrupt());
        LASSERT(!atomic_read(&conn->ibc_refcount));
        LASSERT(list_empty(&conn->ibc_early_rxs));
        LASSERT(list_empty(&conn->ibc_tx_noops));
        LASSERT(list_empty(&conn->ibc_tx_queue));
        LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd));
        LASSERT(list_empty(&conn->ibc_tx_queue_nocred));
        LASSERT(list_empty(&conn->ibc_active_txs));
        LASSERT(!conn->ibc_noops_posted);
        LASSERT(!conn->ibc_nsends_posted);

        switch (conn->ibc_state) {
        default:
                /* conn must be completely disengaged from the network */
                LBUG();

        case IBLND_CONN_DISCONNECTED:
                /* connvars should have been freed already */
                LASSERT(!conn->ibc_connvars);
                break;

        case IBLND_CONN_INIT:
                break;
        }

        /* conn->ibc_cmid might be destroyed by CM already */
        if (cmid && cmid->qp)
                rdma_destroy_qp(cmid);

        if (conn->ibc_cq) {
                rc = ib_destroy_cq(conn->ibc_cq);
                if (rc)
                        CWARN("Error destroying CQ: %d\n", rc);
        }

        if (conn->ibc_rx_pages)
                kiblnd_unmap_rx_descs(conn);

        if (conn->ibc_rxs) {
                LIBCFS_FREE(conn->ibc_rxs,
                            IBLND_RX_MSGS(conn) * sizeof(struct kib_rx));
        }

        if (conn->ibc_connvars)
                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));

        if (conn->ibc_hdev)
                kiblnd_hdev_decref(conn->ibc_hdev);

        /* See CAVEAT EMPTOR above in kiblnd_create_conn */
        if (conn->ibc_state != IBLND_CONN_INIT) {
                struct kib_net *net = peer->ibp_ni->ni_data;

                kiblnd_peer_decref(peer);
                rdma_destroy_id(cmid);
                atomic_dec(&net->ibn_nconns);
        }

        LIBCFS_FREE(conn, sizeof(*conn));
}

int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why)
{
        struct kib_conn *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int count = 0;

        list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry(ctmp, struct kib_conn, ibc_list);

                CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_version, why);

                kiblnd_close_conn_locked(conn, why);
                count++;
        }

        return count;
}

int kiblnd_close_stale_conns_locked(struct kib_peer *peer,
                                    int version, __u64 incarnation)
{
        struct kib_conn *conn;
        struct list_head *ctmp;
        struct list_head *cnxt;
        int count = 0;

        list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) {
                conn = list_entry(ctmp, struct kib_conn, ibc_list);

                if (conn->ibc_version == version &&
                    conn->ibc_incarnation == incarnation)
                        continue;

                CDEBUG(D_NET,
                       "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n",
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_version, conn->ibc_incarnation,
                       version, incarnation);

                kiblnd_close_conn_locked(conn, -ESTALE);
                count++;
        }

        return count;
}

static int kiblnd_close_matching_conns(lnet_ni_t *ni, lnet_nid_t nid)
{
        struct kib_peer *peer;
        struct list_head *ptmp;
        struct list_head *pnxt;
        int lo;
        int hi;
        int i;
        unsigned long flags;
        int count = 0;

        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);

        if (nid != LNET_NID_ANY) {
                lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
                hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
        } else {
                lo = 0;
                hi = kiblnd_data.kib_peer_hash_size - 1;
        }

        for (i = lo; i <= hi; i++) {
                list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
                        peer = list_entry(ptmp, struct kib_peer, ibp_list);
                        LASSERT(!kiblnd_peer_idle(peer));

                        if (peer->ibp_ni != ni)
                                continue;

                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                continue;

                        count += kiblnd_close_peer_conns_locked(peer, 0);
                }
        }

        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        /* wildcards always succeed */
        if (nid == LNET_NID_ANY)
                return 0;

        return !count ? -ENOENT : 0;
}

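/*
 * lnd_ctl handler: services the libcfs ioctls for listing peers, deleting
 * peers, inspecting connections and closing matching connections.
 */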
static int kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
{
        struct libcfs_ioctl_data *data = arg;
        int rc = -EINVAL;

        switch (cmd) {
        case IOC_LIBCFS_GET_PEER: {
                lnet_nid_t nid = 0;
                int count = 0;

                rc = kiblnd_get_peer_info(ni, data->ioc_count,
                                          &nid, &count);
                data->ioc_nid = nid;
                data->ioc_count = count;
                break;
        }

        case IOC_LIBCFS_DEL_PEER: {
                rc = kiblnd_del_peer(ni, data->ioc_nid);
                break;
        }
        case IOC_LIBCFS_GET_CONN: {
                struct kib_conn *conn;

                rc = 0;
                conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
                if (!conn) {
                        rc = -ENOENT;
                        break;
                }

                LASSERT(conn->ibc_cmid);
                data->ioc_nid = conn->ibc_peer->ibp_nid;
                if (!conn->ibc_cmid->route.path_rec)
                        data->ioc_u32[0] = 0; /* iWarp has no path MTU */
                else
                        data->ioc_u32[0] =
                                ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu);
                kiblnd_conn_decref(conn);
                break;
        }
        case IOC_LIBCFS_CLOSE_CONNECTION: {
                rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
                break;
        }

        default:
                break;
        }

        return rc;
}

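/*
 * lnd_query handler: report when @nid was last known alive and, if the peer
 * is not in the hash table, kick off connection establishment with a NULL tx.
 */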
static void kiblnd_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when)
{
        unsigned long last_alive = 0;
        unsigned long now = cfs_time_current();
        rwlock_t *glock = &kiblnd_data.kib_global_lock;
        struct kib_peer *peer;
        unsigned long flags;

        read_lock_irqsave(glock, flags);

        peer = kiblnd_find_peer_locked(nid);
        if (peer)
                last_alive = peer->ibp_last_alive;

        read_unlock_irqrestore(glock, flags);

        if (last_alive)
                *when = last_alive;

        /*
         * peer is not persistent in hash, trigger peer creation
         * and connection establishment with a NULL tx
         */
        if (!peer)
                kiblnd_launch_tx(ni, NULL, nid);

        CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n",
               libcfs_nid2str(nid), peer,
               last_alive ? cfs_duration_sec(now - last_alive) : -1);
}

static void kiblnd_free_pages(struct kib_pages *p)
{
        int npages = p->ibp_npages;
        int i;

        for (i = 0; i < npages; i++) {
                if (p->ibp_pages[i])
                        __free_page(p->ibp_pages[i]);
        }

        LIBCFS_FREE(p, offsetof(struct kib_pages, ibp_pages[npages]));
}

int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages)
{
        struct kib_pages *p;
        int i;

        LIBCFS_CPT_ALLOC(p, lnet_cpt_table(), cpt,
                         offsetof(struct kib_pages, ibp_pages[npages]));
        if (!p) {
                CERROR("Can't allocate descriptor for %d pages\n", npages);
                return -ENOMEM;
        }

        memset(p, 0, offsetof(struct kib_pages, ibp_pages[npages]));
        p->ibp_npages = npages;

        for (i = 0; i < npages; i++) {
                p->ibp_pages[i] = alloc_pages_node(
                                    cfs_cpt_spread_node(lnet_cpt_table(), cpt),
                                    GFP_NOFS, 0);
                if (!p->ibp_pages[i]) {
                        CERROR("Can't allocate page %d of %d\n", i, npages);
                        kiblnd_free_pages(p);
                        return -ENOMEM;
                }
        }

        *pp = p;
        return 0;
}

void kiblnd_unmap_rx_descs(struct kib_conn *conn)
{
        struct kib_rx *rx;
        int i;

        LASSERT(conn->ibc_rxs);
        LASSERT(conn->ibc_hdev);

        for (i = 0; i < IBLND_RX_MSGS(conn); i++) {
                rx = &conn->ibc_rxs[i];

                LASSERT(rx->rx_nob >= 0); /* not posted */

                kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev,
                                        KIBLND_UNMAP_ADDR(rx, rx_msgunmap,
                                                          rx->rx_msgaddr),
                                        IBLND_MSG_SIZE, DMA_FROM_DEVICE);
        }

        kiblnd_free_pages(conn->ibc_rx_pages);

        conn->ibc_rx_pages = NULL;
}

void kiblnd_map_rx_descs(struct kib_conn *conn)
{
        struct kib_rx *rx;
        struct page *pg;
        int pg_off;
        int ipg;
        int i;

        for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) {
                pg = conn->ibc_rx_pages->ibp_pages[ipg];
                rx = &conn->ibc_rxs[i];

                rx->rx_conn = conn;
                rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off);

                rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev,
                                                       rx->rx_msg,
                                                       IBLND_MSG_SIZE,
                                                       DMA_FROM_DEVICE);
                LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev,
                                                  rx->rx_msgaddr));
                KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr);

                CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n",
                       i, rx->rx_msg, rx->rx_msgaddr,
                       (__u64)(page_to_phys(pg) + pg_off));

                pg_off += IBLND_MSG_SIZE;
                LASSERT(pg_off <= PAGE_SIZE);

                if (pg_off == PAGE_SIZE) {
                        pg_off = 0;
                        ipg++;
                        LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn));
                }
        }
}

static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo)
{
        struct kib_hca_dev *hdev = tpo->tpo_hdev;
        struct kib_tx *tx;
        int i;

        LASSERT(!tpo->tpo_pool.po_allocated);

        if (!hdev)
                return;

        for (i = 0; i < tpo->tpo_pool.po_size; i++) {
                tx = &tpo->tpo_tx_descs[i];
                kiblnd_dma_unmap_single(hdev->ibh_ibdev,
                                        KIBLND_UNMAP_ADDR(tx, tx_msgunmap,
                                                          tx->tx_msgaddr),
                                        IBLND_MSG_SIZE, DMA_TO_DEVICE);
        }

        kiblnd_hdev_decref(hdev);
        tpo->tpo_hdev = NULL;
}

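/*
 * Return the device's current HCA descriptor with a reference held, sleeping
 * briefly in a loop while a failover is in progress.
 */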
static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev)
{
        struct kib_hca_dev *hdev;
        unsigned long flags;
        int i = 0;

        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        while (dev->ibd_failover) {
                read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
                if (!(i++ % 50))
                        CDEBUG(D_NET, "%s: Wait for failover\n",
                               dev->ibd_ifname);
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(cfs_time_seconds(1) / 100);

                read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
        }

        kiblnd_hdev_addref_locked(dev->ibd_hdev);
        hdev = dev->ibd_hdev;

        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);

        return hdev;
}

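/*
 * DMA-map every TX descriptor in the pool to its slot in the pre-allocated
 * message pages and put the descriptors on the pool's free list.
 */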
James Simmons8d9de3f2016-06-10 16:13:39 -04001233static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo)
Peng Taod7e09d02013-05-02 16:46:55 +08001234{
James Simmons8d9de3f2016-06-10 16:13:39 -04001235 struct kib_pages *txpgs = tpo->tpo_tx_pages;
1236 struct kib_pool *pool = &tpo->tpo_pool;
1237 struct kib_net *net = pool->po_owner->ps_net;
1238 struct kib_dev *dev;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001239 struct page *page;
James Simmons8d9de3f2016-06-10 16:13:39 -04001240 struct kib_tx *tx;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001241 int page_offset;
1242 int ipage;
1243 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08001244
James Simmons06ace262016-02-12 12:06:08 -05001245 LASSERT(net);
Peng Taod7e09d02013-05-02 16:46:55 +08001246
1247 dev = net->ibn_dev;
1248
1249 /* pre-mapped messages are not bigger than 1 page */
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001250 CLASSERT(IBLND_MSG_SIZE <= PAGE_SIZE);
Peng Taod7e09d02013-05-02 16:46:55 +08001251
1252 /* No fancy arithmetic when we do the buffer calculations */
James Simmons5fd88332016-02-12 12:06:09 -05001253 CLASSERT(!(PAGE_SIZE % IBLND_MSG_SIZE));
Peng Taod7e09d02013-05-02 16:46:55 +08001254
1255 tpo->tpo_hdev = kiblnd_current_hdev(dev);
1256
1257 for (ipage = page_offset = i = 0; i < pool->po_size; i++) {
1258 page = txpgs->ibp_pages[ipage];
1259 tx = &tpo->tpo_tx_descs[i];
1260
James Simmons8d9de3f2016-06-10 16:13:39 -04001261 tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) +
Peng Taod7e09d02013-05-02 16:46:55 +08001262 page_offset);
1263
1264 tx->tx_msgaddr = kiblnd_dma_map_single(
1265 tpo->tpo_hdev->ibh_ibdev, tx->tx_msg,
1266 IBLND_MSG_SIZE, DMA_TO_DEVICE);
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001267 LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev,
James Simmonsc314c312016-02-12 12:06:01 -05001268 tx->tx_msgaddr));
Peng Taod7e09d02013-05-02 16:46:55 +08001269 KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr);
1270
1271 list_add(&tx->tx_list, &pool->po_free_list);
1272
1273 page_offset += IBLND_MSG_SIZE;
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001274 LASSERT(page_offset <= PAGE_SIZE);
Peng Taod7e09d02013-05-02 16:46:55 +08001275
1276 if (page_offset == PAGE_SIZE) {
1277 page_offset = 0;
1278 ipage++;
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001279 LASSERT(ipage <= txpgs->ibp_npages);
Peng Taod7e09d02013-05-02 16:46:55 +08001280 }
1281 }
1282}
1283
James Simmons8d9de3f2016-06-10 16:13:39 -04001284struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, struct kib_rdma_desc *rd,
Jeremy Filizetti2fb44f22016-03-02 18:53:24 -05001285 int negotiated_nfrags)
Peng Taod7e09d02013-05-02 16:46:55 +08001286{
James Simmons8d9de3f2016-06-10 16:13:39 -04001287 struct kib_net *net = ni->ni_data;
1288 struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev;
Amir Shehata32c8deb82016-05-06 21:30:28 -04001289 struct lnet_ioctl_config_o2iblnd_tunables *tunables;
1290 __u16 nfrags;
1291 int mod;
1292
1293 tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
1294 mod = tunables->lnd_map_on_demand;
1295 nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;
Jeremy Filizetti2fb44f22016-03-02 18:53:24 -05001296
Amir Shehata7cadcc72016-03-02 17:02:03 -05001297 LASSERT(hdev->ibh_mrs);
Peng Taod7e09d02013-05-02 16:46:55 +08001298
Amir Shehata32c8deb82016-05-06 21:30:28 -04001299 if (mod > 0 && nfrags <= rd->rd_nfrags)
Peng Taod7e09d02013-05-02 16:46:55 +08001300 return NULL;
1301
Amir Shehata7cadcc72016-03-02 17:02:03 -05001302 return hdev->ibh_mrs;
Peng Taod7e09d02013-05-02 16:46:55 +08001303}
1304
James Simmons8d9de3f2016-06-10 16:13:39 -04001305static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo)
Peng Taod7e09d02013-05-02 16:46:55 +08001306{
Dmitry Eremin8daab0a2016-05-05 14:53:00 -04001307 LASSERT(!fpo->fpo_map_count);
Peng Taod7e09d02013-05-02 16:46:55 +08001308
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001309 if (fpo->fpo_is_fmr) {
1310 if (fpo->fmr.fpo_fmr_pool)
1311 ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
1312 } else {
1313 struct kib_fast_reg_descriptor *frd, *tmp;
1314 int i = 0;
Peng Taod7e09d02013-05-02 16:46:55 +08001315
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001316 list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
1317 frd_list) {
1318 list_del(&frd->frd_list);
1319 ib_dereg_mr(frd->frd_mr);
1320 LIBCFS_FREE(frd, sizeof(*frd));
1321 i++;
1322 }
1323 if (i < fpo->fast_reg.fpo_pool_size)
1324 CERROR("FastReg pool still has %d regions registered\n",
1325 fpo->fast_reg.fpo_pool_size - i);
1326 }
Peng Taod7e09d02013-05-02 16:46:55 +08001327
Dmitry Eremin8daab0a2016-05-05 14:53:00 -04001328 if (fpo->fpo_hdev)
1329 kiblnd_hdev_decref(fpo->fpo_hdev);
Peng Taod7e09d02013-05-02 16:46:55 +08001330
Dmitry Eremin8daab0a2016-05-05 14:53:00 -04001331 LIBCFS_FREE(fpo, sizeof(*fpo));
Peng Taod7e09d02013-05-02 16:46:55 +08001332}
1333
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001334static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
Peng Taod7e09d02013-05-02 16:46:55 +08001335{
James Simmons8d9de3f2016-06-10 16:13:39 -04001336 struct kib_fmr_pool *fpo, *tmp;
Peng Taod7e09d02013-05-02 16:46:55 +08001337
Dmitry Eremin0d33ec52016-05-05 14:53:01 -04001338 list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
Dmitry Eremin8daab0a2016-05-05 14:53:00 -04001339 list_del(&fpo->fpo_list);
1340 kiblnd_destroy_fmr_pool(fpo);
Peng Taod7e09d02013-05-02 16:46:55 +08001341 }
1342}
1343
Amir Shehata32c8deb82016-05-06 21:30:28 -04001344static int
1345kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
1346 int ncpts)
Peng Taod7e09d02013-05-02 16:46:55 +08001347{
Amir Shehata32c8deb82016-05-06 21:30:28 -04001348 int size = tunables->lnd_fmr_pool_size / ncpts;
Peng Taod7e09d02013-05-02 16:46:55 +08001349
1350 return max(IBLND_FMR_POOL, size);
1351}
1352
Amir Shehata32c8deb82016-05-06 21:30:28 -04001353static int
1354kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
1355 int ncpts)
Peng Taod7e09d02013-05-02 16:46:55 +08001356{
Amir Shehata32c8deb82016-05-06 21:30:28 -04001357 int size = tunables->lnd_fmr_flush_trigger / ncpts;
Peng Taod7e09d02013-05-02 16:46:55 +08001358
1359 return max(IBLND_FMR_POOL_FLUSH, size);
1360}
1361
James Simmons8d9de3f2016-06-10 16:13:39 -04001362static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
Peng Taod7e09d02013-05-02 16:46:55 +08001363{
Peng Taod7e09d02013-05-02 16:46:55 +08001364 struct ib_fmr_pool_param param = {
James Simmons51078e22016-02-12 12:06:04 -05001365 .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
Mike Shueyec3d17c2015-05-19 10:14:36 -04001366 .page_shift = PAGE_SHIFT,
1367 .access = (IB_ACCESS_LOCAL_WRITE |
Miguel Bernabeu Diaze39f6ef2015-08-05 23:44:36 +02001368 IB_ACCESS_REMOTE_WRITE),
Mike Shueyec3d17c2015-05-19 10:14:36 -04001369 .pool_size = fps->fps_pool_size,
Peng Taod7e09d02013-05-02 16:46:55 +08001370 .dirty_watermark = fps->fps_flush_trigger,
1371 .flush_function = NULL,
Mike Shueyec3d17c2015-05-19 10:14:36 -04001372 .flush_arg = NULL,
Amir Shehata32c8deb82016-05-06 21:30:28 -04001373 .cache = !!fps->fps_cache };
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001374 int rc = 0;
1375
1376 fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
1377 &param);
1378 if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
1379 rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
1380 if (rc != -ENOSYS)
1381 CERROR("Failed to create FMR pool: %d\n", rc);
1382 else
1383 CERROR("FMRs are not supported\n");
1384 }
1385
1386 return rc;
1387}
1388
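/*
 * Populate a FastReg-mode pool: allocate fps_pool_size fast_reg descriptors
 * on the pool's CPT, each with an IB_MR_TYPE_MEM_REG memory region large
 * enough for LNET_MAX_PAYLOAD, and link them onto fpo_pool_list. On any
 * failure everything allocated so far is torn down again.
 */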
James Simmons8d9de3f2016-06-10 16:13:39 -04001389static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo)
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001390{
1391 struct kib_fast_reg_descriptor *frd, *tmp;
1392 int i, rc;
1393
1394 INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
1395 fpo->fast_reg.fpo_pool_size = 0;
1396 for (i = 0; i < fps->fps_pool_size; i++) {
1397 LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt,
1398 sizeof(*frd));
1399 if (!frd) {
1400 CERROR("Failed to allocate a new fast_reg descriptor\n");
1401 rc = -ENOMEM;
1402 goto out;
1403 }
1404
1405 frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
1406 IB_MR_TYPE_MEM_REG,
1407 LNET_MAX_PAYLOAD / PAGE_SIZE);
1408 if (IS_ERR(frd->frd_mr)) {
1409 rc = PTR_ERR(frd->frd_mr);
1410 			CERROR("ib_alloc_mr failed: %d\n", rc);
1411 frd->frd_mr = NULL;
1412 goto out_middle;
1413 }
1414
1415 frd->frd_valid = true;
1416
1417 list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
1418 fpo->fast_reg.fpo_pool_size++;
1419 }
1420
1421 return 0;
1422
1423out_middle:
1424 if (frd->frd_mr)
1425 ib_dereg_mr(frd->frd_mr);
1426 LIBCFS_FREE(frd, sizeof(*frd));
1427
1428out:
1429 list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
1430 frd_list) {
1431 list_del(&frd->frd_list);
1432 ib_dereg_mr(frd->frd_mr);
1433 LIBCFS_FREE(frd, sizeof(*frd));
1434 }
1435
1436 return rc;
1437}
1438
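/*
 * Allocate one mapping pool for a poolset: take a reference on the current
 * HCA device, check whether it supports FMR (alloc/dealloc/map/unmap_fmr
 * verbs) or FastReg (IB_DEVICE_MEM_MGT_EXTENSIONS), then build the matching
 * pool flavour. Fails with -ENOSYS if the device supports neither.
 */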
James Simmons8d9de3f2016-06-10 16:13:39 -04001439static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps,
1440 struct kib_fmr_pool **pp_fpo)
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001441{
James Simmons8d9de3f2016-06-10 16:13:39 -04001442 struct kib_dev *dev = fps->fps_net->ibn_dev;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001443 struct ib_device_attr *dev_attr;
James Simmons8d9de3f2016-06-10 16:13:39 -04001444 struct kib_fmr_pool *fpo;
Peng Taod7e09d02013-05-02 16:46:55 +08001445 int rc;
1446
1447 LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
James Simmons06ace262016-02-12 12:06:08 -05001448 if (!fpo)
Peng Taod7e09d02013-05-02 16:46:55 +08001449 return -ENOMEM;
1450
1451 fpo->fpo_hdev = kiblnd_current_hdev(dev);
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001452 dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
Peng Taod7e09d02013-05-02 16:46:55 +08001453
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001454 /* Check for FMR or FastReg support */
1455 fpo->fpo_is_fmr = 0;
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001456 if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
1457 fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
1458 fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
1459 fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
1460 LCONSOLE_INFO("Using FMR for registration\n");
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001461 fpo->fpo_is_fmr = 1;
1462 } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
1463 LCONSOLE_INFO("Using FastReg for registration\n");
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001464 } else {
1465 rc = -ENOSYS;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001466		LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs or FastRegs, can't register memory\n");
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001467 goto out_fpo;
Peng Taod7e09d02013-05-02 16:46:55 +08001468 }
1469
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001470 if (fpo->fpo_is_fmr)
1471 rc = kiblnd_alloc_fmr_pool(fps, fpo);
1472 else
1473 rc = kiblnd_alloc_freg_pool(fps, fpo);
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001474 if (rc)
1475 goto out_fpo;
1476
Peng Taod7e09d02013-05-02 16:46:55 +08001477 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001478 fpo->fpo_owner = fps;
Peng Taod7e09d02013-05-02 16:46:55 +08001479 *pp_fpo = fpo;
1480
1481 return 0;
Dmitry Ereminf66fb152016-05-05 14:53:03 -04001482
1483out_fpo:
1484 kiblnd_hdev_decref(fpo->fpo_hdev);
1485 LIBCFS_FREE(fpo, sizeof(*fpo));
1486 return rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001487}
1488
James Simmons8d9de3f2016-06-10 16:13:39 -04001489static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps,
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001490 struct list_head *zombies)
Peng Taod7e09d02013-05-02 16:46:55 +08001491{
James Simmons06ace262016-02-12 12:06:08 -05001492	if (!fps->fps_net) /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001493 return;
1494
1495 spin_lock(&fps->fps_lock);
1496
1497 while (!list_empty(&fps->fps_pool_list)) {
James Simmons8d9de3f2016-06-10 16:13:39 -04001498 struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next,
1499 struct kib_fmr_pool, fpo_list);
Peng Taod7e09d02013-05-02 16:46:55 +08001500 fpo->fpo_failed = 1;
1501 list_del(&fpo->fpo_list);
James Simmons5fd88332016-02-12 12:06:09 -05001502 if (!fpo->fpo_map_count)
Peng Taod7e09d02013-05-02 16:46:55 +08001503 list_add(&fpo->fpo_list, zombies);
1504 else
1505 list_add(&fpo->fpo_list, &fps->fps_failed_pool_list);
1506 }
1507
1508 spin_unlock(&fps->fps_lock);
1509}
1510
James Simmons8d9de3f2016-06-10 16:13:39 -04001511static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps)
Peng Taod7e09d02013-05-02 16:46:55 +08001512{
James Simmons06ace262016-02-12 12:06:08 -05001513 if (fps->fps_net) { /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001514 kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list);
1515 kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list);
1516 }
1517}
1518
Amir Shehata32c8deb82016-05-06 21:30:28 -04001519static int
James Simmons8d9de3f2016-06-10 16:13:39 -04001520kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts,
1521 struct kib_net *net,
Amir Shehata32c8deb82016-05-06 21:30:28 -04001522 struct lnet_ioctl_config_o2iblnd_tunables *tunables)
Peng Taod7e09d02013-05-02 16:46:55 +08001523{
James Simmons8d9de3f2016-06-10 16:13:39 -04001524 struct kib_fmr_pool *fpo;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001525 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001526
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05001527 memset(fps, 0, sizeof(*fps));
Peng Taod7e09d02013-05-02 16:46:55 +08001528
1529 fps->fps_net = net;
1530 fps->fps_cpt = cpt;
Amir Shehata32c8deb82016-05-06 21:30:28 -04001531
1532 fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
1533 fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
1534 fps->fps_cache = tunables->lnd_fmr_cache;
1535
Peng Taod7e09d02013-05-02 16:46:55 +08001536 spin_lock_init(&fps->fps_lock);
1537 INIT_LIST_HEAD(&fps->fps_pool_list);
1538 INIT_LIST_HEAD(&fps->fps_failed_pool_list);
1539
1540 rc = kiblnd_create_fmr_pool(fps, &fpo);
James Simmons5fd88332016-02-12 12:06:09 -05001541 if (!rc)
Peng Taod7e09d02013-05-02 16:46:55 +08001542 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1543
1544 return rc;
1545}
1546
James Simmons8d9de3f2016-06-10 16:13:39 -04001547static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now)
Peng Taod7e09d02013-05-02 16:46:55 +08001548{
James Simmons5fd88332016-02-12 12:06:09 -05001549 if (fpo->fpo_map_count) /* still in use */
Peng Taod7e09d02013-05-02 16:46:55 +08001550 return 0;
1551 if (fpo->fpo_failed)
1552 return 1;
1553 return cfs_time_aftereq(now, fpo->fpo_deadline);
1554}
1555
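/*
 * Flatten the fragments of an RDMA descriptor into the tx page array: every
 * HCA-page-sized piece of every fragment contributes one page-aligned
 * address. Returns the number of pages filled in, ready to be handed to
 * ib_fmr_pool_map_phys().
 */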
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001556static int
James Simmons8d9de3f2016-06-10 16:13:39 -04001557kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd)
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001558{
1559 __u64 *pages = tx->tx_pages;
James Simmons8d9de3f2016-06-10 16:13:39 -04001560 struct kib_hca_dev *hdev;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001561 int npages;
1562 int size;
1563 int i;
1564
1565 hdev = tx->tx_pool->tpo_hdev;
1566
1567 for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
1568 for (size = 0; size < rd->rd_frags[i].rf_nob;
1569 size += hdev->ibh_page_size) {
1570 pages[npages++] = (rd->rd_frags[i].rf_addr &
1571 hdev->ibh_page_mask) + size;
1572 }
1573 }
1574
1575 return npages;
1576}
1577
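/*
 * Release a mapping back to its pool. FMR mappings are unmapped (and the
 * whole ib_fmr_pool is flushed when status is non-zero); FastReg
 * descriptors are marked invalid and returned to the pool's free list so
 * the key is re-registered on next use. The pool's map count is dropped and
 * any non-persistent pools that have been idle past their deadline are
 * destroyed.
 */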
James Simmons8d9de3f2016-06-10 16:13:39 -04001578void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status)
Peng Taod7e09d02013-05-02 16:46:55 +08001579{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001580 LIST_HEAD(zombies);
James Simmons8d9de3f2016-06-10 16:13:39 -04001581 struct kib_fmr_pool *fpo = fmr->fmr_pool;
1582 struct kib_fmr_poolset *fps;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001583 unsigned long now = cfs_time_current();
James Simmons8d9de3f2016-06-10 16:13:39 -04001584 struct kib_fmr_pool *tmp;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001585 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001586
Dmitry Eremin1f199a02016-05-05 14:53:05 -04001587 if (!fpo)
1588 return;
Peng Taod7e09d02013-05-02 16:46:55 +08001589
Dmitry Eremin1f199a02016-05-05 14:53:05 -04001590 fps = fpo->fpo_owner;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001591 if (fpo->fpo_is_fmr) {
1592 if (fmr->fmr_pfmr) {
1593 rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
1594 LASSERT(!rc);
1595 fmr->fmr_pfmr = NULL;
1596 }
Peng Taod7e09d02013-05-02 16:46:55 +08001597
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001598 if (status) {
1599 rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
1600 LASSERT(!rc);
1601 }
1602 } else {
1603 struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
Peng Taod7e09d02013-05-02 16:46:55 +08001604
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001605 if (frd) {
1606 frd->frd_valid = false;
1607 spin_lock(&fps->fps_lock);
1608 list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
1609 spin_unlock(&fps->fps_lock);
1610 fmr->fmr_frd = NULL;
1611 }
Peng Taod7e09d02013-05-02 16:46:55 +08001612 }
Peng Taod7e09d02013-05-02 16:46:55 +08001613 fmr->fmr_pool = NULL;
Peng Taod7e09d02013-05-02 16:46:55 +08001614
1615 spin_lock(&fps->fps_lock);
Igor Ishchenko747327972015-01-12 18:16:26 +02001616 fpo->fpo_map_count--; /* decref the pool */
Peng Taod7e09d02013-05-02 16:46:55 +08001617
1618 list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) {
1619 /* the first pool is persistent */
1620 if (fps->fps_pool_list.next == &fpo->fpo_list)
1621 continue;
1622
1623 if (kiblnd_fmr_pool_is_idle(fpo, now)) {
1624 list_move(&fpo->fpo_list, &zombies);
Igor Ishchenko747327972015-01-12 18:16:26 +02001625 fps->fps_version++;
Peng Taod7e09d02013-05-02 16:46:55 +08001626 }
1627 }
1628 spin_unlock(&fps->fps_lock);
1629
1630 if (!list_empty(&zombies))
1631 kiblnd_destroy_fmr_pool_list(&zombies);
1632}
1633
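/*
 * Map the fragments of a tx for RDMA. Each pool in the set is tried in
 * turn: FMR-mode pools map the flattened page list with
 * ib_fmr_pool_map_phys(), FastReg-mode pools take a free descriptor, map
 * the scatterlist with ib_map_mr_sg() and prepare the (re)registration work
 * requests. If every pool is exhausted a new one is created, backing off if
 * another thread is already growing the set or a recent attempt failed.
 */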
James Simmons8d9de3f2016-06-10 16:13:39 -04001634int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx,
1635 struct kib_rdma_desc *rd, __u32 nob, __u64 iov,
1636 struct kib_fmr *fmr)
Peng Taod7e09d02013-05-02 16:46:55 +08001637{
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001638 __u64 *pages = tx->tx_pages;
1639 bool is_rx = (rd != tx->tx_rd);
1640	bool tx_pages_mapped = false;
James Simmons8d9de3f2016-06-10 16:13:39 -04001641 struct kib_fmr_pool *fpo;
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001642 int npages = 0;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001643 __u64 version;
1644 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001645
1646 again:
1647 spin_lock(&fps->fps_lock);
1648 version = fps->fps_version;
1649 list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
1650 fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1651 fpo->fpo_map_count++;
Peng Taod7e09d02013-05-02 16:46:55 +08001652
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001653 if (fpo->fpo_is_fmr) {
1654 struct ib_pool_fmr *pfmr;
1655
1656 spin_unlock(&fps->fps_lock);
1657
1658 if (!tx_pages_mapped) {
1659 npages = kiblnd_map_tx_pages(tx, rd);
1660				tx_pages_mapped = true;
1661 }
1662
1663 pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
1664 pages, npages, iov);
1665 if (likely(!IS_ERR(pfmr))) {
1666 fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
1667 pfmr->fmr->lkey;
1668 fmr->fmr_frd = NULL;
1669 fmr->fmr_pfmr = pfmr;
1670 fmr->fmr_pool = fpo;
1671 return 0;
1672 }
1673 rc = PTR_ERR(pfmr);
1674 } else {
1675 if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
1676 struct kib_fast_reg_descriptor *frd;
1677 struct ib_reg_wr *wr;
1678 struct ib_mr *mr;
1679 int n;
1680
1681 frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
1682 struct kib_fast_reg_descriptor,
1683 frd_list);
1684 list_del(&frd->frd_list);
1685 spin_unlock(&fps->fps_lock);
1686
1687 mr = frd->frd_mr;
1688
1689 if (!frd->frd_valid) {
1690 __u32 key = is_rx ? mr->rkey : mr->lkey;
1691 struct ib_send_wr *inv_wr;
1692
1693 inv_wr = &frd->frd_inv_wr;
1694 memset(inv_wr, 0, sizeof(*inv_wr));
1695 inv_wr->opcode = IB_WR_LOCAL_INV;
1696 inv_wr->wr_id = IBLND_WID_MR;
1697 inv_wr->ex.invalidate_rkey = key;
1698
1699 /* Bump the key */
1700 key = ib_inc_rkey(key);
1701 ib_update_fast_reg_key(mr, key);
1702 }
1703
1704 n = ib_map_mr_sg(mr, tx->tx_frags,
Linus Torvalds2f37dd12016-05-20 22:20:48 -07001705 tx->tx_nfrags, NULL, PAGE_SIZE);
Dmitry Eremin80e05b32016-05-05 14:53:07 -04001706 if (unlikely(n != tx->tx_nfrags)) {
1707 CERROR("Failed to map mr %d/%d elements\n",
1708 n, tx->tx_nfrags);
1709 return n < 0 ? n : -EINVAL;
1710 }
1711
1712 mr->iova = iov;
1713
1714 /* Prepare FastReg WR */
1715 wr = &frd->frd_fastreg_wr;
1716 memset(wr, 0, sizeof(*wr));
1717 wr->wr.opcode = IB_WR_REG_MR;
1718 wr->wr.wr_id = IBLND_WID_MR;
1719 wr->wr.num_sge = 0;
1720 wr->wr.send_flags = 0;
1721 wr->mr = mr;
1722 wr->key = is_rx ? mr->rkey : mr->lkey;
1723 wr->access = (IB_ACCESS_LOCAL_WRITE |
1724 IB_ACCESS_REMOTE_WRITE);
1725
1726 fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
1727 fmr->fmr_frd = frd;
1728 fmr->fmr_pfmr = NULL;
1729 fmr->fmr_pool = fpo;
1730 return 0;
1731 }
1732 spin_unlock(&fps->fps_lock);
1733 rc = -EBUSY;
Peng Taod7e09d02013-05-02 16:46:55 +08001734 }
1735
1736 spin_lock(&fps->fps_lock);
1737 fpo->fpo_map_count--;
Dmitry Ereminc1b2e0b2016-05-05 14:53:04 -04001738 if (rc != -EAGAIN) {
Peng Taod7e09d02013-05-02 16:46:55 +08001739 spin_unlock(&fps->fps_lock);
Dmitry Ereminc1b2e0b2016-05-05 14:53:04 -04001740 return rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001741 }
1742
1743 /* EAGAIN and ... */
1744 if (version != fps->fps_version) {
1745 spin_unlock(&fps->fps_lock);
1746 goto again;
1747 }
1748 }
1749
1750 if (fps->fps_increasing) {
1751 spin_unlock(&fps->fps_lock);
James Simmonsc314c312016-02-12 12:06:01 -05001752		CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for it to complete\n");
Peng Taod7e09d02013-05-02 16:46:55 +08001753 schedule();
1754 goto again;
Peng Taod7e09d02013-05-02 16:46:55 +08001755 }
1756
Greg Kroah-Hartman699503b2014-07-12 01:03:41 -07001757 if (time_before(cfs_time_current(), fps->fps_next_retry)) {
Peng Taod7e09d02013-05-02 16:46:55 +08001758 /* someone failed recently */
1759 spin_unlock(&fps->fps_lock);
1760 return -EAGAIN;
1761 }
1762
1763 fps->fps_increasing = 1;
1764 spin_unlock(&fps->fps_lock);
1765
1766 CDEBUG(D_NET, "Allocate new FMR pool\n");
1767 rc = kiblnd_create_fmr_pool(fps, &fpo);
1768 spin_lock(&fps->fps_lock);
1769 fps->fps_increasing = 0;
James Simmons5fd88332016-02-12 12:06:09 -05001770 if (!rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08001771 fps->fps_version++;
1772 list_add_tail(&fpo->fpo_list, &fps->fps_pool_list);
1773 } else {
1774 fps->fps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1775 }
1776 spin_unlock(&fps->fps_lock);
1777
1778 goto again;
1779}
1780
James Simmons8d9de3f2016-06-10 16:13:39 -04001781static void kiblnd_fini_pool(struct kib_pool *pool)
Peng Taod7e09d02013-05-02 16:46:55 +08001782{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001783 LASSERT(list_empty(&pool->po_free_list));
James Simmons5fd88332016-02-12 12:06:09 -05001784 LASSERT(!pool->po_allocated);
Peng Taod7e09d02013-05-02 16:46:55 +08001785
1786 CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name);
1787}
1788
James Simmons8d9de3f2016-06-10 16:13:39 -04001789static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size)
Peng Taod7e09d02013-05-02 16:46:55 +08001790{
1791 CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name);
1792
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05001793 memset(pool, 0, sizeof(*pool));
Peng Taod7e09d02013-05-02 16:46:55 +08001794 INIT_LIST_HEAD(&pool->po_free_list);
1795 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1796 pool->po_owner = ps;
1797 pool->po_size = size;
1798}
1799
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001800static void kiblnd_destroy_pool_list(struct list_head *head)
Peng Taod7e09d02013-05-02 16:46:55 +08001801{
James Simmons8d9de3f2016-06-10 16:13:39 -04001802 struct kib_pool *pool;
Peng Taod7e09d02013-05-02 16:46:55 +08001803
1804 while (!list_empty(head)) {
James Simmons8d9de3f2016-06-10 16:13:39 -04001805 pool = list_entry(head->next, struct kib_pool, po_list);
Peng Taod7e09d02013-05-02 16:46:55 +08001806 list_del(&pool->po_list);
1807
James Simmons06ace262016-02-12 12:06:08 -05001808 LASSERT(pool->po_owner);
Peng Taod7e09d02013-05-02 16:46:55 +08001809 pool->po_owner->ps_pool_destroy(pool);
1810 }
1811}
1812
James Simmons8d9de3f2016-06-10 16:13:39 -04001813static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies)
Peng Taod7e09d02013-05-02 16:46:55 +08001814{
James Simmons06ace262016-02-12 12:06:08 -05001815	if (!ps->ps_net) /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001816 return;
1817
1818 spin_lock(&ps->ps_lock);
1819 while (!list_empty(&ps->ps_pool_list)) {
James Simmons8d9de3f2016-06-10 16:13:39 -04001820 struct kib_pool *po = list_entry(ps->ps_pool_list.next,
1821 struct kib_pool, po_list);
Peng Taod7e09d02013-05-02 16:46:55 +08001822 po->po_failed = 1;
1823 list_del(&po->po_list);
James Simmons5fd88332016-02-12 12:06:09 -05001824 if (!po->po_allocated)
Peng Taod7e09d02013-05-02 16:46:55 +08001825 list_add(&po->po_list, zombies);
1826 else
1827 list_add(&po->po_list, &ps->ps_failed_pool_list);
1828 }
1829 spin_unlock(&ps->ps_lock);
1830}
1831
James Simmons8d9de3f2016-06-10 16:13:39 -04001832static void kiblnd_fini_poolset(struct kib_poolset *ps)
Peng Taod7e09d02013-05-02 16:46:55 +08001833{
James Simmons06ace262016-02-12 12:06:08 -05001834 if (ps->ps_net) { /* initialized? */
Peng Taod7e09d02013-05-02 16:46:55 +08001835 kiblnd_destroy_pool_list(&ps->ps_failed_pool_list);
1836 kiblnd_destroy_pool_list(&ps->ps_pool_list);
1837 }
1838}
1839
James Simmons8d9de3f2016-06-10 16:13:39 -04001840static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt,
1841 struct kib_net *net, char *name, int size,
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001842 kib_ps_pool_create_t po_create,
1843 kib_ps_pool_destroy_t po_destroy,
1844 kib_ps_node_init_t nd_init,
1845 kib_ps_node_fini_t nd_fini)
Peng Taod7e09d02013-05-02 16:46:55 +08001846{
James Simmons8d9de3f2016-06-10 16:13:39 -04001847 struct kib_pool *pool;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001848 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001849
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05001850 memset(ps, 0, sizeof(*ps));
Peng Taod7e09d02013-05-02 16:46:55 +08001851
Mike Shueyec3d17c2015-05-19 10:14:36 -04001852 ps->ps_cpt = cpt;
1853 ps->ps_net = net;
Peng Taod7e09d02013-05-02 16:46:55 +08001854 ps->ps_pool_create = po_create;
1855 ps->ps_pool_destroy = po_destroy;
1856 ps->ps_node_init = nd_init;
1857 ps->ps_node_fini = nd_fini;
1858 ps->ps_pool_size = size;
1859 if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name))
1860 >= sizeof(ps->ps_name))
1861 return -E2BIG;
1862 spin_lock_init(&ps->ps_lock);
1863 INIT_LIST_HEAD(&ps->ps_pool_list);
1864 INIT_LIST_HEAD(&ps->ps_failed_pool_list);
1865
1866 rc = ps->ps_pool_create(ps, size, &pool);
James Simmons5fd88332016-02-12 12:06:09 -05001867 if (!rc)
Peng Taod7e09d02013-05-02 16:46:55 +08001868 list_add(&pool->po_list, &ps->ps_pool_list);
1869 else
1870 CERROR("Failed to create the first pool for %s\n", ps->ps_name);
1871
1872 return rc;
1873}
1874
James Simmons8d9de3f2016-06-10 16:13:39 -04001875static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now)
Peng Taod7e09d02013-05-02 16:46:55 +08001876{
James Simmons5fd88332016-02-12 12:06:09 -05001877 if (pool->po_allocated) /* still in use */
Peng Taod7e09d02013-05-02 16:46:55 +08001878 return 0;
1879 if (pool->po_failed)
1880 return 1;
1881 return cfs_time_aftereq(now, pool->po_deadline);
1882}
1883
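/*
 * Return a node to the generic pool it came from, running the poolset's
 * per-node fini hook first. Idle non-persistent pools whose deadline has
 * passed are collected and destroyed outside the poolset lock.
 */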
James Simmons8d9de3f2016-06-10 16:13:39 -04001884void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node)
Peng Taod7e09d02013-05-02 16:46:55 +08001885{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001886 LIST_HEAD(zombies);
James Simmons8d9de3f2016-06-10 16:13:39 -04001887 struct kib_poolset *ps = pool->po_owner;
1888 struct kib_pool *tmp;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001889 unsigned long now = cfs_time_current();
Peng Taod7e09d02013-05-02 16:46:55 +08001890
1891 spin_lock(&ps->ps_lock);
1892
James Simmons06ace262016-02-12 12:06:08 -05001893 if (ps->ps_node_fini)
Peng Taod7e09d02013-05-02 16:46:55 +08001894 ps->ps_node_fini(pool, node);
1895
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02001896 LASSERT(pool->po_allocated > 0);
Peng Taod7e09d02013-05-02 16:46:55 +08001897 list_add(node, &pool->po_free_list);
Igor Ishchenko747327972015-01-12 18:16:26 +02001898 pool->po_allocated--;
Peng Taod7e09d02013-05-02 16:46:55 +08001899
1900 list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) {
1901 /* the first pool is persistent */
1902 if (ps->ps_pool_list.next == &pool->po_list)
1903 continue;
1904
1905 if (kiblnd_pool_is_idle(pool, now))
1906 list_move(&pool->po_list, &zombies);
1907 }
1908 spin_unlock(&ps->ps_lock);
1909
1910 if (!list_empty(&zombies))
1911 kiblnd_destroy_pool_list(&zombies);
1912}
1913
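/*
 * Take a node from the first pool in the set with a free entry, running the
 * per-node init hook under the poolset lock. If all pools are empty, either
 * wait (with exponentially increasing sleeps) for another thread that is
 * already growing the set, or create a new pool and retry. Returns NULL
 * when a new pool cannot be allocated or a grow attempt failed very
 * recently.
 */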
James Simmons8d9de3f2016-06-10 16:13:39 -04001914struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps)
Peng Taod7e09d02013-05-02 16:46:55 +08001915{
Mike Shueyec3d17c2015-05-19 10:14:36 -04001916 struct list_head *node;
James Simmons8d9de3f2016-06-10 16:13:39 -04001917 struct kib_pool *pool;
Liang Zhenea363b42016-03-02 18:53:30 -05001918 unsigned int interval = 1;
1919 unsigned long time_before;
1920 unsigned int trips = 0;
Mike Shueyec3d17c2015-05-19 10:14:36 -04001921 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08001922
1923 again:
1924 spin_lock(&ps->ps_lock);
1925 list_for_each_entry(pool, &ps->ps_pool_list, po_list) {
1926 if (list_empty(&pool->po_free_list))
1927 continue;
1928
Igor Ishchenko747327972015-01-12 18:16:26 +02001929 pool->po_allocated++;
Peng Taod7e09d02013-05-02 16:46:55 +08001930 pool->po_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
1931 node = pool->po_free_list.next;
1932 list_del(node);
1933
James Simmons06ace262016-02-12 12:06:08 -05001934 if (ps->ps_node_init) {
Peng Taod7e09d02013-05-02 16:46:55 +08001935 /* still hold the lock */
1936 ps->ps_node_init(pool, node);
1937 }
1938 spin_unlock(&ps->ps_lock);
1939 return node;
1940 }
1941
1942 /* no available tx pool and ... */
1943 if (ps->ps_increasing) {
1944 /* another thread is allocating a new pool */
1945 spin_unlock(&ps->ps_lock);
Liang Zhenea363b42016-03-02 18:53:30 -05001946 trips++;
1947		CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d jiffies for it to complete. trips = %d\n",
1948 ps->ps_name, interval, trips);
1949
1950 set_current_state(TASK_INTERRUPTIBLE);
1951 schedule_timeout(interval);
1952 if (interval < cfs_time_seconds(1))
1953 interval *= 2;
1954
Peng Taod7e09d02013-05-02 16:46:55 +08001955 goto again;
1956 }
1957
Greg Kroah-Hartman699503b2014-07-12 01:03:41 -07001958 if (time_before(cfs_time_current(), ps->ps_next_retry)) {
Peng Taod7e09d02013-05-02 16:46:55 +08001959 /* someone failed recently */
1960 spin_unlock(&ps->ps_lock);
1961 return NULL;
1962 }
1963
1964 ps->ps_increasing = 1;
1965 spin_unlock(&ps->ps_lock);
1966
1967 CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name);
Liang Zhenea363b42016-03-02 18:53:30 -05001968 time_before = cfs_time_current();
Peng Taod7e09d02013-05-02 16:46:55 +08001969 rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool);
Liang Zhenea363b42016-03-02 18:53:30 -05001970	CDEBUG(D_NET, "ps_pool_create took %lu jiffies to complete\n",
1971 cfs_time_current() - time_before);
Peng Taod7e09d02013-05-02 16:46:55 +08001972
1973 spin_lock(&ps->ps_lock);
1974 ps->ps_increasing = 0;
James Simmons5fd88332016-02-12 12:06:09 -05001975 if (!rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08001976 list_add_tail(&pool->po_list, &ps->ps_pool_list);
1977 } else {
1978 ps->ps_next_retry = cfs_time_shift(IBLND_POOL_RETRY);
1979		CERROR("Can't allocate new %s pool: out of memory\n",
1980 ps->ps_name);
1981 }
1982 spin_unlock(&ps->ps_lock);
1983
1984 goto again;
1985}
1986
James Simmons8d9de3f2016-06-10 16:13:39 -04001987static void kiblnd_destroy_tx_pool(struct kib_pool *pool)
Peng Taod7e09d02013-05-02 16:46:55 +08001988{
James Simmons8d9de3f2016-06-10 16:13:39 -04001989 struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool);
Mike Shueyec3d17c2015-05-19 10:14:36 -04001990 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08001991
James Simmons5fd88332016-02-12 12:06:09 -05001992 LASSERT(!pool->po_allocated);
Peng Taod7e09d02013-05-02 16:46:55 +08001993
James Simmons06ace262016-02-12 12:06:08 -05001994 if (tpo->tpo_tx_pages) {
Peng Taod7e09d02013-05-02 16:46:55 +08001995 kiblnd_unmap_tx_pool(tpo);
1996 kiblnd_free_pages(tpo->tpo_tx_pages);
1997 }
1998
James Simmons06ace262016-02-12 12:06:08 -05001999 if (!tpo->tpo_tx_descs)
Peng Taod7e09d02013-05-02 16:46:55 +08002000 goto out;
2001
2002 for (i = 0; i < pool->po_size; i++) {
James Simmons8d9de3f2016-06-10 16:13:39 -04002003 struct kib_tx *tx = &tpo->tpo_tx_descs[i];
Peng Taod7e09d02013-05-02 16:46:55 +08002004
2005 list_del(&tx->tx_list);
James Simmons06ace262016-02-12 12:06:08 -05002006 if (tx->tx_pages)
Peng Taod7e09d02013-05-02 16:46:55 +08002007 LIBCFS_FREE(tx->tx_pages,
2008 LNET_MAX_IOV *
2009 sizeof(*tx->tx_pages));
James Simmons06ace262016-02-12 12:06:08 -05002010 if (tx->tx_frags)
Peng Taod7e09d02013-05-02 16:46:55 +08002011 LIBCFS_FREE(tx->tx_frags,
James Simmons147280d2016-05-09 10:53:48 -04002012 (1 + IBLND_MAX_RDMA_FRAGS) *
2013 sizeof(*tx->tx_frags));
James Simmons06ace262016-02-12 12:06:08 -05002014 if (tx->tx_wrq)
Peng Taod7e09d02013-05-02 16:46:55 +08002015 LIBCFS_FREE(tx->tx_wrq,
2016 (1 + IBLND_MAX_RDMA_FRAGS) *
2017 sizeof(*tx->tx_wrq));
James Simmons06ace262016-02-12 12:06:08 -05002018 if (tx->tx_sge)
Peng Taod7e09d02013-05-02 16:46:55 +08002019 LIBCFS_FREE(tx->tx_sge,
2020 (1 + IBLND_MAX_RDMA_FRAGS) *
2021 sizeof(*tx->tx_sge));
James Simmons06ace262016-02-12 12:06:08 -05002022 if (tx->tx_rd)
Peng Taod7e09d02013-05-02 16:46:55 +08002023 LIBCFS_FREE(tx->tx_rd,
James Simmons8d9de3f2016-06-10 16:13:39 -04002024 offsetof(struct kib_rdma_desc,
Peng Taod7e09d02013-05-02 16:46:55 +08002025 rd_frags[IBLND_MAX_RDMA_FRAGS]));
2026 }
2027
2028 LIBCFS_FREE(tpo->tpo_tx_descs,
James Simmons8d9de3f2016-06-10 16:13:39 -04002029 pool->po_size * sizeof(struct kib_tx));
Peng Taod7e09d02013-05-02 16:46:55 +08002030out:
2031 kiblnd_fini_pool(pool);
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05002032 LIBCFS_FREE(tpo, sizeof(*tpo));
Peng Taod7e09d02013-05-02 16:46:55 +08002033}
2034
2035static int kiblnd_tx_pool_size(int ncpts)
2036{
2037 int ntx = *kiblnd_tunables.kib_ntx / ncpts;
2038
2039 return max(IBLND_TX_POOL, ntx);
2040}
2041
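/*
 * Create one TX descriptor pool: allocate the pool structure and the pages
 * backing the message buffers, then a descriptor array in which each kib_tx
 * gets its fragment scatterlist, work requests, sges, RDMA descriptor and
 * (when FMR/FastReg is in use) a page array. The whole pool is mapped
 * before being handed back; any allocation failure destroys it again.
 */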
James Simmons8d9de3f2016-06-10 16:13:39 -04002042static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size,
2043 struct kib_pool **pp_po)
Peng Taod7e09d02013-05-02 16:46:55 +08002044{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002045 int i;
2046 int npg;
James Simmons8d9de3f2016-06-10 16:13:39 -04002047 struct kib_pool *pool;
2048 struct kib_tx_pool *tpo;
Peng Taod7e09d02013-05-02 16:46:55 +08002049
2050 LIBCFS_CPT_ALLOC(tpo, lnet_cpt_table(), ps->ps_cpt, sizeof(*tpo));
James Simmons06ace262016-02-12 12:06:08 -05002051 if (!tpo) {
Peng Taod7e09d02013-05-02 16:46:55 +08002052 CERROR("Failed to allocate TX pool\n");
2053 return -ENOMEM;
2054 }
2055
2056 pool = &tpo->tpo_pool;
2057 kiblnd_init_pool(ps, pool, size);
2058 tpo->tpo_tx_descs = NULL;
2059 tpo->tpo_tx_pages = NULL;
2060
2061 npg = (size * IBLND_MSG_SIZE + PAGE_SIZE - 1) / PAGE_SIZE;
James Simmons5fd88332016-02-12 12:06:09 -05002062 if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) {
Peng Taod7e09d02013-05-02 16:46:55 +08002063 CERROR("Can't allocate tx pages: %d\n", npg);
Janani Ravichandrana4e872f2016-02-10 22:47:33 -05002064 LIBCFS_FREE(tpo, sizeof(*tpo));
Peng Taod7e09d02013-05-02 16:46:55 +08002065 return -ENOMEM;
2066 }
2067
2068 LIBCFS_CPT_ALLOC(tpo->tpo_tx_descs, lnet_cpt_table(), ps->ps_cpt,
James Simmons8d9de3f2016-06-10 16:13:39 -04002069 size * sizeof(struct kib_tx));
James Simmons06ace262016-02-12 12:06:08 -05002070 if (!tpo->tpo_tx_descs) {
Peng Taod7e09d02013-05-02 16:46:55 +08002071 CERROR("Can't allocate %d tx descriptors\n", size);
2072 ps->ps_pool_destroy(pool);
2073 return -ENOMEM;
2074 }
2075
James Simmons8d9de3f2016-06-10 16:13:39 -04002076 memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx));
Peng Taod7e09d02013-05-02 16:46:55 +08002077
2078 for (i = 0; i < size; i++) {
James Simmons8d9de3f2016-06-10 16:13:39 -04002079 struct kib_tx *tx = &tpo->tpo_tx_descs[i];
Peng Taod7e09d02013-05-02 16:46:55 +08002080
2081 tx->tx_pool = tpo;
James Simmons06ace262016-02-12 12:06:08 -05002082 if (ps->ps_net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002083 LIBCFS_CPT_ALLOC(tx->tx_pages,
2084 lnet_cpt_table(), ps->ps_cpt,
2085 LNET_MAX_IOV * sizeof(*tx->tx_pages));
James Simmons06ace262016-02-12 12:06:08 -05002086 if (!tx->tx_pages)
Peng Taod7e09d02013-05-02 16:46:55 +08002087 break;
2088 }
2089
2090 LIBCFS_CPT_ALLOC(tx->tx_frags, lnet_cpt_table(), ps->ps_cpt,
James Simmons147280d2016-05-09 10:53:48 -04002091 (1 + IBLND_MAX_RDMA_FRAGS) *
2092 sizeof(*tx->tx_frags));
James Simmons06ace262016-02-12 12:06:08 -05002093 if (!tx->tx_frags)
Peng Taod7e09d02013-05-02 16:46:55 +08002094 break;
2095
James Simmons147280d2016-05-09 10:53:48 -04002096 sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1);
Peng Taod7e09d02013-05-02 16:46:55 +08002097
2098 LIBCFS_CPT_ALLOC(tx->tx_wrq, lnet_cpt_table(), ps->ps_cpt,
2099 (1 + IBLND_MAX_RDMA_FRAGS) *
2100 sizeof(*tx->tx_wrq));
James Simmons06ace262016-02-12 12:06:08 -05002101 if (!tx->tx_wrq)
Peng Taod7e09d02013-05-02 16:46:55 +08002102 break;
2103
2104 LIBCFS_CPT_ALLOC(tx->tx_sge, lnet_cpt_table(), ps->ps_cpt,
2105 (1 + IBLND_MAX_RDMA_FRAGS) *
2106 sizeof(*tx->tx_sge));
James Simmons06ace262016-02-12 12:06:08 -05002107 if (!tx->tx_sge)
Peng Taod7e09d02013-05-02 16:46:55 +08002108 break;
2109
2110 LIBCFS_CPT_ALLOC(tx->tx_rd, lnet_cpt_table(), ps->ps_cpt,
James Simmons8d9de3f2016-06-10 16:13:39 -04002111 offsetof(struct kib_rdma_desc,
Peng Taod7e09d02013-05-02 16:46:55 +08002112 rd_frags[IBLND_MAX_RDMA_FRAGS]));
James Simmons06ace262016-02-12 12:06:08 -05002113 if (!tx->tx_rd)
Peng Taod7e09d02013-05-02 16:46:55 +08002114 break;
2115 }
2116
2117 if (i == size) {
2118 kiblnd_map_tx_pool(tpo);
2119 *pp_po = pool;
2120 return 0;
2121 }
2122
2123 ps->ps_pool_destroy(pool);
2124 return -ENOMEM;
2125}
2126
James Simmons8d9de3f2016-06-10 16:13:39 -04002127static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node)
Peng Taod7e09d02013-05-02 16:46:55 +08002128{
James Simmons8d9de3f2016-06-10 16:13:39 -04002129 struct kib_tx_poolset *tps = container_of(pool->po_owner,
2130 struct kib_tx_poolset,
2131 tps_poolset);
2132 struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list);
Peng Taod7e09d02013-05-02 16:46:55 +08002133
Igor Ishchenko747327972015-01-12 18:16:26 +02002134 tx->tx_cookie = tps->tps_next_tx_cookie++;
Peng Taod7e09d02013-05-02 16:46:55 +08002135}
2136
James Simmons8d9de3f2016-06-10 16:13:39 -04002137static void kiblnd_net_fini_pools(struct kib_net *net)
Peng Taod7e09d02013-05-02 16:46:55 +08002138{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002139 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002140
2141 cfs_cpt_for_each(i, lnet_cpt_table()) {
James Simmons8d9de3f2016-06-10 16:13:39 -04002142 struct kib_tx_poolset *tps;
2143 struct kib_fmr_poolset *fps;
Peng Taod7e09d02013-05-02 16:46:55 +08002144
James Simmons06ace262016-02-12 12:06:08 -05002145 if (net->ibn_tx_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002146 tps = net->ibn_tx_ps[i];
2147 kiblnd_fini_poolset(&tps->tps_poolset);
2148 }
2149
James Simmons06ace262016-02-12 12:06:08 -05002150 if (net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002151 fps = net->ibn_fmr_ps[i];
2152 kiblnd_fini_fmr_poolset(fps);
2153 }
Peng Taod7e09d02013-05-02 16:46:55 +08002154 }
2155
James Simmons06ace262016-02-12 12:06:08 -05002156 if (net->ibn_tx_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002157 cfs_percpt_free(net->ibn_tx_ps);
2158 net->ibn_tx_ps = NULL;
2159 }
2160
James Simmons06ace262016-02-12 12:06:08 -05002161 if (net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002162 cfs_percpt_free(net->ibn_fmr_ps);
2163 net->ibn_fmr_ps = NULL;
2164 }
Peng Taod7e09d02013-05-02 16:46:55 +08002165}
2166
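/*
 * Set up the per-CPT pools for a network: when map-on-demand is enabled an
 * FMR/FastReg poolset is created for each requested CPT (the TX poolsets
 * must be created afterwards, see LU-2268), then the TX poolsets
 * themselves. Any failure tears down whatever was already created.
 */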
James Simmons8d9de3f2016-06-10 16:13:39 -04002167static int kiblnd_net_init_pools(struct kib_net *net, lnet_ni_t *ni, __u32 *cpts,
Amir Shehata32c8deb82016-05-06 21:30:28 -04002168 int ncpts)
Peng Taod7e09d02013-05-02 16:46:55 +08002169{
Amir Shehata32c8deb82016-05-06 21:30:28 -04002170 struct lnet_ioctl_config_o2iblnd_tunables *tunables;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002171 unsigned long flags;
2172 int cpt;
Amir Shehata32c8deb82016-05-06 21:30:28 -04002173 int rc;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002174 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002175
Amir Shehata32c8deb82016-05-06 21:30:28 -04002176 tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
2177
Peng Taod7e09d02013-05-02 16:46:55 +08002178 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
Amir Shehata32c8deb82016-05-06 21:30:28 -04002179 if (!tunables->lnd_map_on_demand) {
Mike Shueyec3d17c2015-05-19 10:14:36 -04002180 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
Peng Taod7e09d02013-05-02 16:46:55 +08002181 goto create_tx_pool;
2182 }
2183
2184 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2185
Amir Shehata32c8deb82016-05-06 21:30:28 -04002186 if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
Peng Taod7e09d02013-05-02 16:46:55 +08002187 CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
Amir Shehata32c8deb82016-05-06 21:30:28 -04002188 tunables->lnd_fmr_pool_size,
Peng Taod7e09d02013-05-02 16:46:55 +08002189 *kiblnd_tunables.kib_ntx / 4);
2190 rc = -EINVAL;
2191 goto failed;
2192 }
2193
Oleg Drokin415bcb52015-08-18 21:04:35 -04002194 /*
2195 * TX pool must be created later than FMR, see LU-2268
2196 * for details
2197 */
James Simmons06ace262016-02-12 12:06:08 -05002198 LASSERT(!net->ibn_tx_ps);
Peng Taod7e09d02013-05-02 16:46:55 +08002199
Oleg Drokin415bcb52015-08-18 21:04:35 -04002200 /*
2201 * premapping can fail if ibd_nmr > 1, so we always create
2202 * FMR pool and map-on-demand if premapping failed
James Simmons7e221b62016-03-24 11:24:02 -04002203 *
2204 * cfs_percpt_alloc creates an array of struct kib_fmr_poolset.
2205 * The number of struct kib_fmr_poolsets created is equal to the
2206 * number of CPTs that exist, i.e. net->ibn_fmr_ps[cpt].
Oleg Drokin415bcb52015-08-18 21:04:35 -04002207 */
Peng Taod7e09d02013-05-02 16:46:55 +08002208 net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
James Simmons8d9de3f2016-06-10 16:13:39 -04002209 sizeof(struct kib_fmr_poolset));
James Simmons06ace262016-02-12 12:06:08 -05002210 if (!net->ibn_fmr_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002211 CERROR("Failed to allocate FMR pool array\n");
2212 rc = -ENOMEM;
2213 goto failed;
2214 }
2215
2216 for (i = 0; i < ncpts; i++) {
James Simmons06ace262016-02-12 12:06:08 -05002217 cpt = !cpts ? i : cpts[i];
Amir Shehata32c8deb82016-05-06 21:30:28 -04002218 rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
2219 net, tunables);
Amir Shehata7cadcc72016-03-02 17:02:03 -05002220 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002221 CERROR("Can't initialize FMR pool for CPT %d: %d\n",
2222 cpt, rc);
2223 goto failed;
2224 }
2225 }
2226
Amir Shehata7cadcc72016-03-02 17:02:03 -05002227 if (i > 0)
Peng Taod7e09d02013-05-02 16:46:55 +08002228 LASSERT(i == ncpts);
Peng Taod7e09d02013-05-02 16:46:55 +08002229
2230 create_tx_pool:
James Simmons7e221b62016-03-24 11:24:02 -04002231 /*
2232 * cfs_percpt_alloc creates an array of struct kib_tx_poolset.
2233 * The number of struct kib_tx_poolsets created is equal to the
2234 * number of CPTs that exist, i.e. net->ibn_tx_ps[cpt].
2235 */
Peng Taod7e09d02013-05-02 16:46:55 +08002236 net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
James Simmons8d9de3f2016-06-10 16:13:39 -04002237 sizeof(struct kib_tx_poolset));
James Simmons06ace262016-02-12 12:06:08 -05002238 if (!net->ibn_tx_ps) {
Peng Taod7e09d02013-05-02 16:46:55 +08002239 CERROR("Failed to allocate tx pool array\n");
2240 rc = -ENOMEM;
2241 goto failed;
2242 }
2243
2244 for (i = 0; i < ncpts; i++) {
James Simmons06ace262016-02-12 12:06:08 -05002245 cpt = !cpts ? i : cpts[i];
Peng Taod7e09d02013-05-02 16:46:55 +08002246 rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset,
2247 cpt, net, "TX",
2248 kiblnd_tx_pool_size(ncpts),
2249 kiblnd_create_tx_pool,
2250 kiblnd_destroy_tx_pool,
2251 kiblnd_tx_init, NULL);
James Simmons5fd88332016-02-12 12:06:09 -05002252 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002253 CERROR("Can't initialize TX pool for CPT %d: %d\n",
2254 cpt, rc);
2255 goto failed;
2256 }
2257 }
2258
2259 return 0;
2260 failed:
2261 kiblnd_net_fini_pools(net);
James Simmons5fd88332016-02-12 12:06:09 -05002262 LASSERT(rc);
Peng Taod7e09d02013-05-02 16:46:55 +08002263 return rc;
2264}
2265
James Simmons8d9de3f2016-06-10 16:13:39 -04002266static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002267{
James Simmons4420cfd2016-02-12 12:06:00 -05002268 /*
2269 * It's safe to assume a HCA can handle a page size
2270 * matching that of the native system
2271 */
Peng Taod7e09d02013-05-02 16:46:55 +08002272 hdev->ibh_page_shift = PAGE_SHIFT;
2273 hdev->ibh_page_size = 1 << PAGE_SHIFT;
2274 hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1);
2275
Or Gerlitzcebfe5c2015-12-18 10:59:49 +02002276 hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size;
Peng Taod7e09d02013-05-02 16:46:55 +08002277 if (hdev->ibh_mr_size == ~0ULL) {
2278 hdev->ibh_mr_shift = 64;
2279 return 0;
2280 }
2281
Greg Kroah-Hartman55f5a822014-07-12 20:26:07 -07002282 CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size);
Peng Taod7e09d02013-05-02 16:46:55 +08002283 return -EINVAL;
2284}
2285
James Simmons8d9de3f2016-06-10 16:13:39 -04002286static void kiblnd_hdev_cleanup_mrs(struct kib_hca_dev *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002287{
Amir Shehata7cadcc72016-03-02 17:02:03 -05002288 if (!hdev->ibh_mrs)
Peng Taod7e09d02013-05-02 16:46:55 +08002289 return;
2290
Amir Shehata7cadcc72016-03-02 17:02:03 -05002291 ib_dereg_mr(hdev->ibh_mrs);
Peng Taod7e09d02013-05-02 16:46:55 +08002292
Amir Shehata7cadcc72016-03-02 17:02:03 -05002293 hdev->ibh_mrs = NULL;
Peng Taod7e09d02013-05-02 16:46:55 +08002294}
2295
James Simmons8d9de3f2016-06-10 16:13:39 -04002296void kiblnd_hdev_destroy(struct kib_hca_dev *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002297{
2298 kiblnd_hdev_cleanup_mrs(hdev);
2299
James Simmons06ace262016-02-12 12:06:08 -05002300 if (hdev->ibh_pd)
Peng Taod7e09d02013-05-02 16:46:55 +08002301 ib_dealloc_pd(hdev->ibh_pd);
2302
James Simmons06ace262016-02-12 12:06:08 -05002303 if (hdev->ibh_cmid)
Peng Taod7e09d02013-05-02 16:46:55 +08002304 rdma_destroy_id(hdev->ibh_cmid);
2305
2306 LIBCFS_FREE(hdev, sizeof(*hdev));
2307}
2308
James Simmons8d9de3f2016-06-10 16:13:39 -04002309static int kiblnd_hdev_setup_mrs(struct kib_hca_dev *hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002310{
2311 struct ib_mr *mr;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002312 int rc;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002313 int acflags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
Peng Taod7e09d02013-05-02 16:46:55 +08002314
2315 rc = kiblnd_hdev_get_attr(hdev);
James Simmons5fd88332016-02-12 12:06:09 -05002316 if (rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002317 return rc;
2318
Luis de Bethencourt01738442015-10-21 18:40:40 +01002319 mr = ib_get_dma_mr(hdev->ibh_pd, acflags);
2320 if (IS_ERR(mr)) {
2321 CERROR("Failed ib_get_dma_mr : %ld\n", PTR_ERR(mr));
2322		CERROR("Failed ib_get_dma_mr: %ld\n", PTR_ERR(mr));
2323 return PTR_ERR(mr);
2324 }
Peng Taod7e09d02013-05-02 16:46:55 +08002325
Amir Shehata7cadcc72016-03-02 17:02:03 -05002326 hdev->ibh_mrs = mr;
Peng Taod7e09d02013-05-02 16:46:55 +08002327
Peng Taod7e09d02013-05-02 16:46:55 +08002328 return 0;
2329}
2330
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002331/* DUMMY */
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002332static int kiblnd_dummy_callback(struct rdma_cm_id *cmid,
2333 struct rdma_cm_event *event)
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002334{
Peng Taod7e09d02013-05-02 16:46:55 +08002335 return 0;
2336}
2337
James Simmons8d9de3f2016-06-10 16:13:39 -04002338static int kiblnd_dev_need_failover(struct kib_dev *dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002339{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002340 struct rdma_cm_id *cmid;
2341 struct sockaddr_in srcaddr;
2342 struct sockaddr_in dstaddr;
2343 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08002344
James Simmons06ace262016-02-12 12:06:08 -05002345 if (!dev->ibd_hdev || /* initializing */
2346 !dev->ibd_hdev->ibh_cmid || /* listener is dead */
Peng Taod7e09d02013-05-02 16:46:55 +08002347 *kiblnd_tunables.kib_dev_failover > 1) /* debugging */
2348 return 1;
2349
James Simmons4420cfd2016-02-12 12:06:00 -05002350 /*
2351 * XXX: it's UGLY, but I don't have better way to find
Peng Taod7e09d02013-05-02 16:46:55 +08002352 * ib-bonding HCA failover because:
2353 *
2354 * a. no reliable CM event for HCA failover...
2355 * b. no OFED API to get ib_device for current net_device...
2356 *
2357 * We have only two choices at this point:
2358 *
2359 * a. rdma_bind_addr(), it will conflict with listener cmid
James Simmons4420cfd2016-02-12 12:06:00 -05002360 * b. rdma_resolve_addr() to zero addr
2361 */
Peng Taod7e09d02013-05-02 16:46:55 +08002362 cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP,
2363 IB_QPT_RC);
2364 if (IS_ERR(cmid)) {
2365 rc = PTR_ERR(cmid);
2366 CERROR("Failed to create cmid for failover: %d\n", rc);
2367 return rc;
2368 }
2369
2370 memset(&srcaddr, 0, sizeof(srcaddr));
Mike Shueyec3d17c2015-05-19 10:14:36 -04002371 srcaddr.sin_family = AF_INET;
Peng Taod7e09d02013-05-02 16:46:55 +08002372 srcaddr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2373
2374 memset(&dstaddr, 0, sizeof(dstaddr));
2375 dstaddr.sin_family = AF_INET;
2376 rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr,
2377 (struct sockaddr *)&dstaddr, 1);
James Simmons5fd88332016-02-12 12:06:09 -05002378 if (rc || !cmid->device) {
Peng Tao5e8f6922013-07-15 22:27:09 +08002379 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2380 dev->ibd_ifname, &dev->ibd_ifip,
Peng Taod7e09d02013-05-02 16:46:55 +08002381 cmid->device, rc);
2382 rdma_destroy_id(cmid);
2383 return rc;
2384 }
2385
Liang Zhen199a0cc2015-09-14 18:41:33 -04002386 rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */
2387 rdma_destroy_id(cmid);
Peng Taod7e09d02013-05-02 16:46:55 +08002388
Liang Zhen199a0cc2015-09-14 18:41:33 -04002389 return rc;
Peng Taod7e09d02013-05-02 16:46:55 +08002390}
2391
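/*
 * Fail a device over to its current IB interface: drop the old listener
 * (its cmid would otherwise make rdma_bind_addr() fail with EADDRINUSE),
 * create and bind a new cmid, allocate a fresh PD and DMA MR, start
 * listening again, then swap the new kib_hca_dev in. Every pool of every
 * net on the device is marked failed; pools that are already idle are
 * destroyed here, busy ones are retired as they drain.
 */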
James Simmons8d9de3f2016-06-10 16:13:39 -04002392int kiblnd_dev_failover(struct kib_dev *dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002393{
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002394 LIST_HEAD(zombie_tpo);
2395 LIST_HEAD(zombie_ppo);
2396 LIST_HEAD(zombie_fpo);
Mike Shueyec3d17c2015-05-19 10:14:36 -04002397 struct rdma_cm_id *cmid = NULL;
James Simmons8d9de3f2016-06-10 16:13:39 -04002398 struct kib_hca_dev *hdev = NULL;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002399 struct ib_pd *pd;
James Simmons8d9de3f2016-06-10 16:13:39 -04002400 struct kib_net *net;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002401 struct sockaddr_in addr;
2402 unsigned long flags;
2403 int rc = 0;
2404 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002405
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002406 LASSERT(*kiblnd_tunables.kib_dev_failover > 1 ||
James Simmons06ace262016-02-12 12:06:08 -05002407 dev->ibd_can_failover || !dev->ibd_hdev);
Peng Taod7e09d02013-05-02 16:46:55 +08002408
2409 rc = kiblnd_dev_need_failover(dev);
2410 if (rc <= 0)
2411 goto out;
2412
James Simmons06ace262016-02-12 12:06:08 -05002413 if (dev->ibd_hdev &&
2414 dev->ibd_hdev->ibh_cmid) {
James Simmons4420cfd2016-02-12 12:06:00 -05002415 /*
2416 * XXX it's not good to close old listener at here,
Peng Taod7e09d02013-05-02 16:46:55 +08002417 * because we can fail to create new listener.
2418 * But we have to close it now, otherwise rdma_bind_addr
James Simmons4420cfd2016-02-12 12:06:00 -05002419		 * will return EADDRINUSE, unfortunately.
2420 */
Peng Taod7e09d02013-05-02 16:46:55 +08002421 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2422
2423 cmid = dev->ibd_hdev->ibh_cmid;
James Simmons4420cfd2016-02-12 12:06:00 -05002424 /*
2425 * make next schedule of kiblnd_dev_need_failover()
2426 * return 1 for me
2427 */
Peng Taod7e09d02013-05-02 16:46:55 +08002428 dev->ibd_hdev->ibh_cmid = NULL;
2429 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2430
2431 rdma_destroy_id(cmid);
2432 }
2433
2434 cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP,
2435 IB_QPT_RC);
2436 if (IS_ERR(cmid)) {
2437 rc = PTR_ERR(cmid);
2438 CERROR("Failed to create cmid for failover: %d\n", rc);
2439 goto out;
2440 }
2441
2442 memset(&addr, 0, sizeof(addr));
2443 addr.sin_family = AF_INET;
2444 addr.sin_addr.s_addr = (__force u32)htonl(dev->ibd_ifip);
2445 addr.sin_port = htons(*kiblnd_tunables.kib_service);
2446
2447 /* Bind to failover device or port */
2448 rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr);
James Simmons5fd88332016-02-12 12:06:09 -05002449 if (rc || !cmid->device) {
Peng Tao5e8f6922013-07-15 22:27:09 +08002450 CERROR("Failed to bind %s:%pI4h to device(%p): %d\n",
2451 dev->ibd_ifname, &dev->ibd_ifip,
Peng Taod7e09d02013-05-02 16:46:55 +08002452 cmid->device, rc);
2453 rdma_destroy_id(cmid);
2454 goto out;
2455 }
2456
2457 LIBCFS_ALLOC(hdev, sizeof(*hdev));
James Simmons06ace262016-02-12 12:06:08 -05002458 if (!hdev) {
Peng Taod7e09d02013-05-02 16:46:55 +08002459 CERROR("Failed to allocate kib_hca_dev\n");
2460 rdma_destroy_id(cmid);
2461 rc = -ENOMEM;
2462 goto out;
2463 }
2464
2465 atomic_set(&hdev->ibh_ref, 1);
2466 hdev->ibh_dev = dev;
2467 hdev->ibh_cmid = cmid;
2468 hdev->ibh_ibdev = cmid->device;
2469
Christoph Hellwiged082d32016-09-05 12:56:17 +02002470 pd = ib_alloc_pd(cmid->device, 0);
Peng Taod7e09d02013-05-02 16:46:55 +08002471 if (IS_ERR(pd)) {
2472 rc = PTR_ERR(pd);
2473 CERROR("Can't allocate PD: %d\n", rc);
2474 goto out;
2475 }
2476
2477 hdev->ibh_pd = pd;
2478
2479 rc = rdma_listen(cmid, 0);
James Simmons5fd88332016-02-12 12:06:09 -05002480 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002481 CERROR("Can't start new listener: %d\n", rc);
2482 goto out;
2483 }
2484
2485 rc = kiblnd_hdev_setup_mrs(hdev);
James Simmons5fd88332016-02-12 12:06:09 -05002486 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002487 CERROR("Can't setup device: %d\n", rc);
2488 goto out;
2489 }
2490
2491 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2492
Fabian Frederick6d37b172015-06-10 18:32:21 +02002493 swap(dev->ibd_hdev, hdev); /* take over the refcount */
Peng Taod7e09d02013-05-02 16:46:55 +08002494
2495 list_for_each_entry(net, &dev->ibd_nets, ibn_list) {
2496 cfs_cpt_for_each(i, lnet_cpt_table()) {
2497 kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset,
2498 &zombie_tpo);
2499
Oleg Drokin415bcb52015-08-18 21:04:35 -04002500 if (net->ibn_fmr_ps)
Peng Taod7e09d02013-05-02 16:46:55 +08002501 kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i],
2502 &zombie_fpo);
Peng Taod7e09d02013-05-02 16:46:55 +08002503 }
2504 }
2505
2506 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2507 out:
2508 if (!list_empty(&zombie_tpo))
2509 kiblnd_destroy_pool_list(&zombie_tpo);
2510 if (!list_empty(&zombie_ppo))
2511 kiblnd_destroy_pool_list(&zombie_ppo);
2512 if (!list_empty(&zombie_fpo))
2513 kiblnd_destroy_fmr_pool_list(&zombie_fpo);
James Simmons06ace262016-02-12 12:06:08 -05002514 if (hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002515 kiblnd_hdev_decref(hdev);
2516
James Simmons5fd88332016-02-12 12:06:09 -05002517 if (rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002518 dev->ibd_failed_failover++;
2519 else
2520 dev->ibd_failed_failover = 0;
2521
2522 return rc;
2523}
2524
James Simmons8d9de3f2016-06-10 16:13:39 -04002525void kiblnd_destroy_dev(struct kib_dev *dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002526{
James Simmons5fd88332016-02-12 12:06:09 -05002527 LASSERT(!dev->ibd_nnets);
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002528 LASSERT(list_empty(&dev->ibd_nets));
Peng Taod7e09d02013-05-02 16:46:55 +08002529
2530 list_del(&dev->ibd_fail_list);
2531 list_del(&dev->ibd_list);
2532
James Simmons06ace262016-02-12 12:06:08 -05002533 if (dev->ibd_hdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002534 kiblnd_hdev_decref(dev->ibd_hdev);
2535
2536 LIBCFS_FREE(dev, sizeof(*dev));
2537}
2538
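/*
 * Create a kib_dev for the named IPoIB interface: query its IP address and
 * up/down state, note whether it is a bonding master (and therefore able to
 * fail over), then run an initial kiblnd_dev_failover() to bind the
 * listener and set up the HCA before adding the device to kib_devs.
 */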
James Simmons8d9de3f2016-06-10 16:13:39 -04002539static struct kib_dev *kiblnd_create_dev(char *ifname)
Peng Taod7e09d02013-05-02 16:46:55 +08002540{
2541 struct net_device *netdev;
James Simmons8d9de3f2016-06-10 16:13:39 -04002542 struct kib_dev *dev;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002543 __u32 netmask;
2544 __u32 ip;
2545 int up;
2546 int rc;
Peng Taod7e09d02013-05-02 16:46:55 +08002547
James Simmons1ad6a732015-06-08 22:27:10 -04002548 rc = lnet_ipif_query(ifname, &up, &ip, &netmask);
James Simmons5fd88332016-02-12 12:06:09 -05002549 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002550 CERROR("Can't query IPoIB interface %s: %d\n",
2551 ifname, rc);
2552 return NULL;
2553 }
2554
2555 if (!up) {
2556 CERROR("Can't query IPoIB interface %s: it's down\n", ifname);
2557 return NULL;
2558 }
2559
2560 LIBCFS_ALLOC(dev, sizeof(*dev));
James Simmons06ace262016-02-12 12:06:08 -05002561 if (!dev)
Peng Taod7e09d02013-05-02 16:46:55 +08002562 return NULL;
2563
Peng Taod7e09d02013-05-02 16:46:55 +08002564 netdev = dev_get_by_name(&init_net, ifname);
James Simmons06ace262016-02-12 12:06:08 -05002565 if (!netdev) {
Peng Taod7e09d02013-05-02 16:46:55 +08002566 dev->ibd_can_failover = 0;
2567 } else {
2568 dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER);
2569 dev_put(netdev);
2570 }
2571
2572 INIT_LIST_HEAD(&dev->ibd_nets);
2573 INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */
2574 INIT_LIST_HEAD(&dev->ibd_fail_list);
2575 dev->ibd_ifip = ip;
2576 strcpy(&dev->ibd_ifname[0], ifname);
2577
2578 /* initialize the device */
2579 rc = kiblnd_dev_failover(dev);
James Simmons5fd88332016-02-12 12:06:09 -05002580 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002581 CERROR("Can't initialize device: %d\n", rc);
2582 LIBCFS_FREE(dev, sizeof(*dev));
2583 return NULL;
2584 }
2585
James Simmonsc314c312016-02-12 12:06:01 -05002586 list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs);
Peng Taod7e09d02013-05-02 16:46:55 +08002587 return dev;
2588}
2589
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002590static void kiblnd_base_shutdown(void)
Peng Taod7e09d02013-05-02 16:46:55 +08002591{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002592 struct kib_sched_info *sched;
2593 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002594
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002595 LASSERT(list_empty(&kiblnd_data.kib_devs));
Peng Taod7e09d02013-05-02 16:46:55 +08002596
Peng Taod7e09d02013-05-02 16:46:55 +08002597 switch (kiblnd_data.kib_init) {
2598 default:
2599 LBUG();
2600
2601 case IBLND_INIT_ALL:
2602 case IBLND_INIT_DATA:
James Simmons06ace262016-02-12 12:06:08 -05002603 LASSERT(kiblnd_data.kib_peers);
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002604 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002605 LASSERT(list_empty(&kiblnd_data.kib_peers[i]));
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002606 LASSERT(list_empty(&kiblnd_data.kib_connd_zombies));
2607 LASSERT(list_empty(&kiblnd_data.kib_connd_conns));
Liang Zhen4d99b252016-03-02 18:53:29 -05002608 LASSERT(list_empty(&kiblnd_data.kib_reconn_list));
2609 LASSERT(list_empty(&kiblnd_data.kib_reconn_wait));
Peng Taod7e09d02013-05-02 16:46:55 +08002610
2611 /* flag threads to terminate; wake and wait for them to die */
2612 kiblnd_data.kib_shutdown = 1;
2613
James Simmons4420cfd2016-02-12 12:06:00 -05002614 /*
2615 * NB: we really want to stop scheduler threads net by net
Peng Taod7e09d02013-05-02 16:46:55 +08002616 * instead of the whole module, this should be improved
James Simmons4420cfd2016-02-12 12:06:00 -05002617 * with dynamic configuration LNet
2618 */
Peng Taod7e09d02013-05-02 16:46:55 +08002619 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds)
2620 wake_up_all(&sched->ibs_waitq);
2621
2622 wake_up_all(&kiblnd_data.kib_connd_waitq);
2623 wake_up_all(&kiblnd_data.kib_failover_waitq);
2624
2625 i = 2;
James Simmons5fd88332016-02-12 12:06:09 -05002626 while (atomic_read(&kiblnd_data.kib_nthreads)) {
Peng Taod7e09d02013-05-02 16:46:55 +08002627 i++;
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002628 /* power of 2 ? */
2629 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
Peng Taod7e09d02013-05-02 16:46:55 +08002630 "Waiting for %d threads to terminate\n",
2631 atomic_read(&kiblnd_data.kib_nthreads));
Peng Taod3caf4d2014-03-18 21:05:56 +08002632 set_current_state(TASK_UNINTERRUPTIBLE);
2633 schedule_timeout(cfs_time_seconds(1));
Peng Taod7e09d02013-05-02 16:46:55 +08002634 }
2635
2636 /* fall through */
2637
2638 case IBLND_INIT_NOTHING:
2639 break;
2640 }
2641
James Simmons06ace262016-02-12 12:06:08 -05002642 if (kiblnd_data.kib_peers) {
Peng Taod7e09d02013-05-02 16:46:55 +08002643 LIBCFS_FREE(kiblnd_data.kib_peers,
2644 sizeof(struct list_head) *
2645 kiblnd_data.kib_peer_hash_size);
2646 }
2647
James Simmons06ace262016-02-12 12:06:08 -05002648 if (kiblnd_data.kib_scheds)
Peng Taod7e09d02013-05-02 16:46:55 +08002649 cfs_percpt_free(kiblnd_data.kib_scheds);
2650
Peng Taod7e09d02013-05-02 16:46:55 +08002651 kiblnd_data.kib_init = IBLND_INIT_NOTHING;
2652 module_put(THIS_MODULE);
2653}
2654
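/*
 * Shut one LNet NI down: flag the net as shutting down, delete all its
 * peers and wait for them to disconnect, free its pools, unhook it from its
 * device (destroying the device if this was its last net) and free the
 * kib_net. When the last device goes away the module-wide state is torn
 * down by kiblnd_base_shutdown().
 */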
Frank Zago439b4d42016-03-02 17:02:00 -05002655static void kiblnd_shutdown(lnet_ni_t *ni)
Peng Taod7e09d02013-05-02 16:46:55 +08002656{
James Simmons8d9de3f2016-06-10 16:13:39 -04002657 struct kib_net *net = ni->ni_data;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002658 rwlock_t *g_lock = &kiblnd_data.kib_global_lock;
2659 int i;
2660 unsigned long flags;
Peng Taod7e09d02013-05-02 16:46:55 +08002661
2662 LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
2663
James Simmons06ace262016-02-12 12:06:08 -05002664 if (!net)
Peng Taod7e09d02013-05-02 16:46:55 +08002665 goto out;
2666
Peng Taod7e09d02013-05-02 16:46:55 +08002667 write_lock_irqsave(g_lock, flags);
2668 net->ibn_shutdown = 1;
2669 write_unlock_irqrestore(g_lock, flags);
2670
2671 switch (net->ibn_init) {
2672 default:
2673 LBUG();
2674
2675 case IBLND_INIT_ALL:
2676 /* nuke all existing peers within this net */
2677 kiblnd_del_peer(ni, LNET_NID_ANY);
2678
2679 /* Wait for all peer state to clean up */
2680 i = 2;
James Simmons5fd88332016-02-12 12:06:09 -05002681 while (atomic_read(&net->ibn_npeers)) {
Peng Taod7e09d02013-05-02 16:46:55 +08002682 i++;
2683 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
2684 "%s: waiting for %d peers to disconnect\n",
2685 libcfs_nid2str(ni->ni_nid),
2686 atomic_read(&net->ibn_npeers));
Peng Taod3caf4d2014-03-18 21:05:56 +08002687 set_current_state(TASK_UNINTERRUPTIBLE);
2688 schedule_timeout(cfs_time_seconds(1));
Peng Taod7e09d02013-05-02 16:46:55 +08002689 }
2690
2691 kiblnd_net_fini_pools(net);
2692
2693 write_lock_irqsave(g_lock, flags);
2694 LASSERT(net->ibn_dev->ibd_nnets > 0);
2695 net->ibn_dev->ibd_nnets--;
2696 list_del(&net->ibn_list);
2697 write_unlock_irqrestore(g_lock, flags);
2698
2699 /* fall through */
2700
2701 case IBLND_INIT_NOTHING:
James Simmons5fd88332016-02-12 12:06:09 -05002702 LASSERT(!atomic_read(&net->ibn_nconns));
Peng Taod7e09d02013-05-02 16:46:55 +08002703
James Simmons5fd88332016-02-12 12:06:09 -05002704 if (net->ibn_dev && !net->ibn_dev->ibd_nnets)
Peng Taod7e09d02013-05-02 16:46:55 +08002705 kiblnd_destroy_dev(net->ibn_dev);
2706
2707 break;
2708 }
2709
Peng Taod7e09d02013-05-02 16:46:55 +08002710 net->ibn_init = IBLND_INIT_NOTHING;
2711 ni->ni_data = NULL;
2712
2713 LIBCFS_FREE(net, sizeof(*net));
2714
2715out:
2716 if (list_empty(&kiblnd_data.kib_devs))
2717 kiblnd_base_shutdown();
Peng Taod7e09d02013-05-02 16:46:55 +08002718}
2719
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002720static int kiblnd_base_startup(void)
Peng Taod7e09d02013-05-02 16:46:55 +08002721{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002722 struct kib_sched_info *sched;
2723 int rc;
2724 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002725
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002726 LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING);
Peng Taod7e09d02013-05-02 16:46:55 +08002727
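	/* pin the module; the reference is dropped in kiblnd_base_shutdown() */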
2728 try_module_get(THIS_MODULE);
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002729 /* zero pointers, flags etc */
2730 memset(&kiblnd_data, 0, sizeof(kiblnd_data));
Peng Taod7e09d02013-05-02 16:46:55 +08002731
2732 rwlock_init(&kiblnd_data.kib_global_lock);
2733
2734 INIT_LIST_HEAD(&kiblnd_data.kib_devs);
2735 INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs);
2736
2737 kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
2738 LIBCFS_ALLOC(kiblnd_data.kib_peers,
Mike Shueyec3d17c2015-05-19 10:14:36 -04002739 sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size);
James Simmons06ace262016-02-12 12:06:08 -05002740 if (!kiblnd_data.kib_peers)
Peng Taod7e09d02013-05-02 16:46:55 +08002741 goto failed;
Peng Taod7e09d02013-05-02 16:46:55 +08002742 for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
2743 INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
2744
2745 spin_lock_init(&kiblnd_data.kib_connd_lock);
2746 INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
2747 INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
Liang Zhen4d99b252016-03-02 18:53:29 -05002748 INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list);
2749 INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait);
2750
Peng Taod7e09d02013-05-02 16:46:55 +08002751 init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
2752 init_waitqueue_head(&kiblnd_data.kib_failover_waitq);
2753
2754 kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(),
2755 sizeof(*sched));
James Simmons06ace262016-02-12 12:06:08 -05002756 if (!kiblnd_data.kib_scheds)
Peng Taod7e09d02013-05-02 16:46:55 +08002757 goto failed;
2758
2759 cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) {
Mike Shueyec3d17c2015-05-19 10:14:36 -04002760 int nthrs;
Peng Taod7e09d02013-05-02 16:46:55 +08002761
2762 spin_lock_init(&sched->ibs_lock);
2763 INIT_LIST_HEAD(&sched->ibs_conns);
2764 init_waitqueue_head(&sched->ibs_waitq);
2765
2766 nthrs = cfs_cpt_weight(lnet_cpt_table(), i);
2767 if (*kiblnd_tunables.kib_nscheds > 0) {
2768 nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds);
2769 } else {
James Simmons4420cfd2016-02-12 12:06:00 -05002770 /*
2771			 * cap at half of the CPUs; the other half is reserved
2772			 * for upper-layer modules
2773 */
Peng Taod7e09d02013-05-02 16:46:55 +08002774 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2775 }
2776
2777 sched->ibs_nthreads_max = nthrs;
2778 sched->ibs_cpt = i;
2779 }
2780
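	/*
	 * Pre-initialised QP attribute, assumed to be handed to ib_modify_qp()
	 * later when a connection's QP is moved to the error state to flush
	 * its outstanding work requests.
	 */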
2781 kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
2782
2783 /* lists/ptrs/locks initialised */
2784 kiblnd_data.kib_init = IBLND_INIT_DATA;
2785 /*****************************************************/
2786
2787 rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd");
James Simmons5fd88332016-02-12 12:06:09 -05002788 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002789 CERROR("Can't spawn o2iblnd connd: %d\n", rc);
2790 goto failed;
2791 }
2792
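	/*
	 * rc is still zero here; it is only overwritten if the optional
	 * failover thread is actually started below.
	 */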
James Simmons5fd88332016-02-12 12:06:09 -05002793 if (*kiblnd_tunables.kib_dev_failover)
Peng Taod7e09d02013-05-02 16:46:55 +08002794 rc = kiblnd_thread_start(kiblnd_failover_thread, NULL,
2795 "kiblnd_failover");
2796
James Simmons5fd88332016-02-12 12:06:09 -05002797 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002798 CERROR("Can't spawn o2iblnd failover thread: %d\n", rc);
2799 goto failed;
2800 }
2801
2802 /* flag everything initialised */
2803 kiblnd_data.kib_init = IBLND_INIT_ALL;
2804 /*****************************************************/
2805
2806 return 0;
2807
2808 failed:
2809 kiblnd_base_shutdown();
2810 return -ENETDOWN;
2811}
2812
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002813static int kiblnd_start_schedulers(struct kib_sched_info *sched)
Peng Taod7e09d02013-05-02 16:46:55 +08002814{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002815 int rc = 0;
2816 int nthrs;
2817 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002818
James Simmons5fd88332016-02-12 12:06:09 -05002819 if (!sched->ibs_nthreads) {
Peng Taod7e09d02013-05-02 16:46:55 +08002820 if (*kiblnd_tunables.kib_nscheds > 0) {
2821 nthrs = sched->ibs_nthreads_max;
2822 } else {
2823 nthrs = cfs_cpt_weight(lnet_cpt_table(),
2824 sched->ibs_cpt);
2825 nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs);
2826 nthrs = min(IBLND_N_SCHED_HIGH, nthrs);
2827 }
2828 } else {
2829 LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max);
2830		/* start one extra thread when a new interface is added */
Haneen Mohammedb6ee3822015-03-13 20:48:53 +03002831 nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max;
Peng Taod7e09d02013-05-02 16:46:55 +08002832 }
2833
2834 for (i = 0; i < nthrs; i++) {
Mike Shueyec3d17c2015-05-19 10:14:36 -04002835 long id;
2836 char name[20];
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002837
Peng Taod7e09d02013-05-02 16:46:55 +08002838 id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i);
2839 snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
2840 KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
2841 rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name);
James Simmons5fd88332016-02-12 12:06:09 -05002842 if (!rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002843 continue;
2844
2845 CERROR("Can't spawn thread %d for scheduler[%d]: %d\n",
2846 sched->ibs_cpt, sched->ibs_nthreads + i, rc);
2847 break;
2848 }
2849
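	/* record however many schedulers actually started, even on partial failure */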
2850 sched->ibs_nthreads += i;
2851 return rc;
2852}
2853
James Simmons8d9de3f2016-06-10 16:13:39 -04002854static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts,
Guillaume Matheron7a3888a2015-04-02 19:52:07 +02002855 int ncpts)
Peng Taod7e09d02013-05-02 16:46:55 +08002856{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002857 int cpt;
2858 int rc;
2859 int i;
Peng Taod7e09d02013-05-02 16:46:55 +08002860
2861 for (i = 0; i < ncpts; i++) {
2862 struct kib_sched_info *sched;
2863
James Simmons06ace262016-02-12 12:06:08 -05002864 cpt = !cpts ? i : cpts[i];
Peng Taod7e09d02013-05-02 16:46:55 +08002865 sched = kiblnd_data.kib_scheds[cpt];
2866
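		/* an existing device already has schedulers running on this CPT */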
2867 if (!newdev && sched->ibs_nthreads > 0)
2868 continue;
2869
2870 rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]);
James Simmons5fd88332016-02-12 12:06:09 -05002871 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002872 CERROR("Failed to start scheduler threads for %s\n",
2873 dev->ibd_ifname);
2874 return rc;
2875 }
2876 }
2877 return 0;
2878}
2879
James Simmons8d9de3f2016-06-10 16:13:39 -04002880static struct kib_dev *kiblnd_dev_search(char *ifname)
Peng Taod7e09d02013-05-02 16:46:55 +08002881{
James Simmons8d9de3f2016-06-10 16:13:39 -04002882 struct kib_dev *alias = NULL;
2883 struct kib_dev *dev;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002884 char *colon;
2885 char *colon2;
Peng Taod7e09d02013-05-02 16:46:55 +08002886
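	/*
	 * A name containing ':' is an IP alias (e.g. "ib0:1").  Compare names
	 * with the alias suffix temporarily stripped so an alias can be
	 * matched against its base interface.
	 */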
2887 colon = strchr(ifname, ':');
2888 list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
James Simmons5fd88332016-02-12 12:06:09 -05002889 if (!strcmp(&dev->ibd_ifname[0], ifname))
Peng Taod7e09d02013-05-02 16:46:55 +08002890 return dev;
2891
James Simmons06ace262016-02-12 12:06:08 -05002892 if (alias)
Peng Taod7e09d02013-05-02 16:46:55 +08002893 continue;
2894
2895 colon2 = strchr(dev->ibd_ifname, ':');
James Simmons06ace262016-02-12 12:06:08 -05002896 if (colon)
Peng Taod7e09d02013-05-02 16:46:55 +08002897 *colon = 0;
James Simmons06ace262016-02-12 12:06:08 -05002898 if (colon2)
Peng Taod7e09d02013-05-02 16:46:55 +08002899 *colon2 = 0;
2900
James Simmons5fd88332016-02-12 12:06:09 -05002901 if (!strcmp(&dev->ibd_ifname[0], ifname))
Peng Taod7e09d02013-05-02 16:46:55 +08002902 alias = dev;
2903
James Simmons06ace262016-02-12 12:06:08 -05002904 if (colon)
Peng Taod7e09d02013-05-02 16:46:55 +08002905 *colon = ':';
James Simmons06ace262016-02-12 12:06:08 -05002906 if (colon2)
Peng Taod7e09d02013-05-02 16:46:55 +08002907 *colon2 = ':';
2908 }
2909 return alias;
2910}
2911
Frank Zago439b4d42016-03-02 17:02:00 -05002912static int kiblnd_startup(lnet_ni_t *ni)
Peng Taod7e09d02013-05-02 16:46:55 +08002913{
Mike Shueyec3d17c2015-05-19 10:14:36 -04002914 char *ifname;
James Simmons8d9de3f2016-06-10 16:13:39 -04002915 struct kib_dev *ibdev = NULL;
2916 struct kib_net *net;
Arnd Bergmann473c4e02015-09-27 16:45:13 -04002917 struct timespec64 tv;
Mike Shueyec3d17c2015-05-19 10:14:36 -04002918 unsigned long flags;
2919 int rc;
2920 int newdev;
Peng Taod7e09d02013-05-02 16:46:55 +08002921
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002922 LASSERT(ni->ni_lnd == &the_o2iblnd);
Peng Taod7e09d02013-05-02 16:46:55 +08002923
2924 if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
2925 rc = kiblnd_base_startup();
James Simmons5fd88332016-02-12 12:06:09 -05002926 if (rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002927 return rc;
2928 }
2929
2930 LIBCFS_ALLOC(net, sizeof(*net));
2931 ni->ni_data = net;
James Simmons06ace262016-02-12 12:06:08 -05002932 if (!net)
Jiayi Ye3247c4e2014-10-25 11:40:32 +08002933 goto net_failed;
Peng Taod7e09d02013-05-02 16:46:55 +08002934
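	/*
	 * Stamp this net instance with the current time in microseconds;
	 * the incarnation lets peers tell a freshly started net apart from
	 * a stale one during connection setup.
	 */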
Arnd Bergmann473c4e02015-09-27 16:45:13 -04002935 ktime_get_real_ts64(&tv);
2936 net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
2937 tv.tv_nsec / NSEC_PER_USEC;
Peng Taod7e09d02013-05-02 16:46:55 +08002938
Amir Shehataf6e50062016-05-06 21:30:27 -04002939 rc = kiblnd_tunables_setup(ni);
Amir Shehata025ba822016-05-06 21:30:26 -04002940 if (rc)
2941 goto net_failed;
Peng Taod7e09d02013-05-02 16:46:55 +08002942
James Simmons06ace262016-02-12 12:06:08 -05002943 if (ni->ni_interfaces[0]) {
Peng Taod7e09d02013-05-02 16:46:55 +08002944 /* Use the IPoIB interface specified in 'networks=' */
2945
Guillaume Matheronfebe73b2015-04-02 19:35:45 +02002946 CLASSERT(LNET_MAX_INTERFACES > 1);
James Simmons06ace262016-02-12 12:06:08 -05002947 if (ni->ni_interfaces[1]) {
Peng Taod7e09d02013-05-02 16:46:55 +08002948 CERROR("Multiple interfaces not supported\n");
2949 goto failed;
2950 }
2951
2952 ifname = ni->ni_interfaces[0];
2953 } else {
2954 ifname = *kiblnd_tunables.kib_default_ipif;
2955 }
2956
2957 if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
2958 CERROR("IPoIB interface name too long: %s\n", ifname);
2959 goto failed;
2960 }
2961
2962 ibdev = kiblnd_dev_search(ifname);
2963
James Simmons06ace262016-02-12 12:06:08 -05002964 newdev = !ibdev;
Peng Taod7e09d02013-05-02 16:46:55 +08002965 /* hmm...create kib_dev even for alias */
James Simmons5fd88332016-02-12 12:06:09 -05002966 if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname))
Peng Taod7e09d02013-05-02 16:46:55 +08002967 ibdev = kiblnd_create_dev(ifname);
2968
James Simmons06ace262016-02-12 12:06:08 -05002969 if (!ibdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002970 goto failed;
2971
2972 net->ibn_dev = ibdev;
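	/* the address part of the NID is the interface's IPoIB IP address */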
2973 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
2974
2975 rc = kiblnd_dev_start_threads(ibdev, newdev,
2976 ni->ni_cpts, ni->ni_ncpts);
James Simmons5fd88332016-02-12 12:06:09 -05002977 if (rc)
Peng Taod7e09d02013-05-02 16:46:55 +08002978 goto failed;
2979
Amir Shehata32c8deb82016-05-06 21:30:28 -04002980 rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
James Simmons5fd88332016-02-12 12:06:09 -05002981 if (rc) {
Peng Taod7e09d02013-05-02 16:46:55 +08002982 CERROR("Failed to initialize NI pools: %d\n", rc);
2983 goto failed;
2984 }
2985
2986 write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
2987 ibdev->ibd_nnets++;
2988 list_add_tail(&net->ibn_list, &ibdev->ibd_nets);
2989 write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
2990
2991 net->ibn_init = IBLND_INIT_ALL;
2992
2993 return 0;
2994
2995failed:
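	/*
	 * Destroy the device only if it was never attached to this net;
	 * otherwise kiblnd_shutdown() below takes care of it.
	 */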
James Simmons06ace262016-02-12 12:06:08 -05002996 if (!net->ibn_dev && ibdev)
Peng Taod7e09d02013-05-02 16:46:55 +08002997 kiblnd_destroy_dev(ibdev);
2998
Jiayi Ye3247c4e2014-10-25 11:40:32 +08002999net_failed:
Peng Taod7e09d02013-05-02 16:46:55 +08003000 kiblnd_shutdown(ni);
3001
3002 CDEBUG(D_NET, "kiblnd_startup failed\n");
3003 return -ENETDOWN;
3004}
3005
Frank Zago439b4d42016-03-02 17:02:00 -05003006static lnd_t the_o2iblnd = {
3007 .lnd_type = O2IBLND,
3008 .lnd_startup = kiblnd_startup,
3009 .lnd_shutdown = kiblnd_shutdown,
3010 .lnd_ctl = kiblnd_ctl,
3011 .lnd_query = kiblnd_query,
3012 .lnd_send = kiblnd_send,
3013 .lnd_recv = kiblnd_recv,
3014};
3015
Andreas Dilgere0f94112016-02-26 11:36:05 -05003016static void __exit ko2iblnd_exit(void)
Peng Taod7e09d02013-05-02 16:46:55 +08003017{
3018 lnet_unregister_lnd(&the_o2iblnd);
Peng Taod7e09d02013-05-02 16:46:55 +08003019}
3020
Andreas Dilgere0f94112016-02-26 11:36:05 -05003021static int __init ko2iblnd_init(void)
Peng Taod7e09d02013-05-02 16:46:55 +08003022{
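	/* compile-time checks: the largest wire messages must fit in IBLND_MSG_SIZE */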
James Simmons8d9de3f2016-06-10 16:13:39 -04003023 CLASSERT(sizeof(struct kib_msg) <= IBLND_MSG_SIZE);
3024 CLASSERT(offsetof(struct kib_msg,
James Simmonsc314c312016-02-12 12:06:01 -05003025 ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
3026 <= IBLND_MSG_SIZE);
James Simmons8d9de3f2016-06-10 16:13:39 -04003027 CLASSERT(offsetof(struct kib_msg,
James Simmonsc314c312016-02-12 12:06:01 -05003028 ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
3029 <= IBLND_MSG_SIZE);
Peng Taod7e09d02013-05-02 16:46:55 +08003030
Amir Shehata025ba822016-05-06 21:30:26 -04003031 kiblnd_tunables_init();
Peng Taod7e09d02013-05-02 16:46:55 +08003032
3033 lnet_register_lnd(&the_o2iblnd);
3034
3035 return 0;
3036}
3037
James Simmonsa0455472015-11-04 13:40:02 -05003038MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
Andreas Dilger57878e12016-02-26 11:36:04 -05003039MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver");
James Simmons5b0e50b2016-02-26 11:36:03 -05003040MODULE_VERSION("2.7.0");
Peng Taod7e09d02013-05-02 16:46:55 +08003041MODULE_LICENSE("GPL");
3042
Andreas Dilgere0f94112016-02-26 11:36:05 -05003043module_init(ko2iblnd_init);
3044module_exit(ko2iblnd_exit);