/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Basic Transport Functions exploiting Infiniband API
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/socket.h>
#include <linux/if_vlan.h>
#include <linux/random.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <rdma/ib_verbs.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"

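/* delay before an unused link group is freed; smc_lgr_free_work() checks
 * again after this delay whether new connections reused the link group
 */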
#define SMC_LGR_FREE_DELAY	(600 * HZ)

/* Register connection's alert token in our lookup structure.
 * To use rbtrees we have to implement our own insert core.
 * Requires @conns_lock
 * @conn connection to register
 */
static void smc_lgr_add_alert_token(struct smc_connection *conn)
{
	struct rb_node **link, *parent = NULL;
	u32 token = conn->alert_token_local;

	link = &conn->lgr->conns_all.rb_node;
	while (*link) {
		struct smc_connection *cur = rb_entry(*link,
					struct smc_connection, alert_node);

		parent = *link;
		if (cur->alert_token_local > token)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	/* Put the new node there */
	rb_link_node(&conn->alert_node, parent, link);
	rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
}

/* Register connection in link group by assigning an alert token
 * registered in a search tree.
 * Requires @conns_lock
 * Note that '0' is a reserved value and not assigned.
 */
static void smc_lgr_register_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	static atomic_t nexttoken = ATOMIC_INIT(0);

	/* find a new alert_token_local value not yet used by some connection
	 * in this link group
	 */
	sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
	while (!conn->alert_token_local) {
		conn->alert_token_local = atomic_inc_return(&nexttoken);
		if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
			conn->alert_token_local = 0;
	}
	smc_lgr_add_alert_token(conn);
	conn->lgr->conns_num++;
}

/* Unregister connection and reset the alert token of the given connection
 */
static void __smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
	struct smc_link_group *lgr = conn->lgr;

	rb_erase(&conn->alert_node, &lgr->conns_all);
	lgr->conns_num--;
	conn->alert_token_local = 0;
	conn->lgr = NULL;
	sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
}

/* Unregister connection and trigger lgr freeing if applicable
 */
static void smc_lgr_unregister_conn(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;
	int reduced = 0;

	write_lock_bh(&lgr->conns_lock);
	if (conn->alert_token_local) {
		reduced = 1;
		__smc_lgr_unregister_conn(conn);
	}
	write_unlock_bh(&lgr->conns_lock);
	if (reduced && !lgr->conns_num)
		schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
}

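/* delayed work to free a link group once its last connection is gone,
 * unless new connections showed up in the meantime
 */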
static void smc_lgr_free_work(struct work_struct *work)
{
	struct smc_link_group *lgr = container_of(to_delayed_work(work),
						  struct smc_link_group,
						  free_work);
	bool conns;

	spin_lock_bh(&smc_lgr_list.lock);
	read_lock_bh(&lgr->conns_lock);
	conns = RB_EMPTY_ROOT(&lgr->conns_all);
	read_unlock_bh(&lgr->conns_lock);
	if (!conns) { /* number of lgr connections is no longer zero */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	list_del_init(&lgr->list); /* remove from smc_lgr_list */
	spin_unlock_bh(&smc_lgr_list.lock);
	smc_lgr_free(lgr);
}

/* create a new SMC link group */
static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
			  struct smc_ib_device *smcibdev, u8 ibport,
			  char *peer_systemid, unsigned short vlan_id)
{
	struct smc_link_group *lgr;
	struct smc_link *lnk;
	u8 rndvec[3];
	int rc = 0;
	int i;

	lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
	if (!lgr) {
		rc = -ENOMEM;
		goto out;
	}
	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	lgr->sync_err = false;
	lgr->daddr = peer_in_addr;
	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
	lgr->vlan_id = vlan_id;
	rwlock_init(&lgr->sndbufs_lock);
	rwlock_init(&lgr->rmbs_lock);
	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		INIT_LIST_HEAD(&lgr->sndbufs[i]);
		INIT_LIST_HEAD(&lgr->rmbs[i]);
	}
	INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
	lgr->conns_all = RB_ROOT;

	lnk = &lgr->lnk[SMC_SINGLE_LINK];
	/* initialize link */
	lnk->smcibdev = smcibdev;
	lnk->ibport = ibport;
	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
	get_random_bytes(rndvec, sizeof(rndvec));
	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
	rc = smc_wr_alloc_link_mem(lnk);
	if (rc)
		goto free_lgr;
	init_waitqueue_head(&lnk->wr_tx_wait);

	smc->conn.lgr = lgr;
	rwlock_init(&lgr->conns_lock);
	spin_lock_bh(&smc_lgr_list.lock);
	list_add(&lgr->list, &smc_lgr_list.list);
	spin_unlock_bh(&smc_lgr_list.lock);
	return 0;

free_lgr:
	kfree(lgr);
out:
	return rc;
}

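/* release the connection's send buffer slot so it can be reused within
 * the link group
 */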
static void smc_sndbuf_unuse(struct smc_connection *conn)
{
	if (conn->sndbuf_desc) {
		conn->sndbuf_desc->used = 0;
		conn->sndbuf_size = 0;
	}
}

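/* release the connection's RMB slot so it can be reused within the
 * link group
 */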
static void smc_rmb_unuse(struct smc_connection *conn)
{
	if (conn->rmb_desc) {
		conn->rmb_desc->used = 0;
		conn->rmbe_size = 0;
	}
}

/* remove a finished connection from its link group */
void smc_conn_free(struct smc_connection *conn)
{
	struct smc_link_group *lgr = conn->lgr;

	if (!lgr)
		return;
	smc_lgr_unregister_conn(conn);
	smc_rmb_unuse(conn);
	smc_sndbuf_unuse(conn);
}

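/* reset a link and free its work request resources */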
static void smc_link_clear(struct smc_link *lnk)
{
	lnk->peer_qpn = 0;
	smc_wr_free_link(lnk);
	smc_wr_free_link_mem(lnk);
}

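/* free all send buffers of a link group */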
static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
{
	struct smc_buf_desc *sndbuf_desc, *bf_desc;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
					 list) {
			kfree(sndbuf_desc->cpu_addr);
			kfree(sndbuf_desc);
		}
	}
}

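/* free all RMBs of a link group */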
static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
{
	struct smc_buf_desc *rmb_desc, *bf_desc;
	int i;

	for (i = 0; i < SMC_RMBE_SIZES; i++) {
		list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
					 list) {
			kfree(rmb_desc->cpu_addr);
			kfree(rmb_desc);
		}
	}
}

/* remove a link group */
void smc_lgr_free(struct smc_link_group *lgr)
{
	smc_lgr_free_rmbs(lgr);
	smc_lgr_free_sndbufs(lgr);
	smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
	kfree(lgr);
}

/* terminate link group abnormally */
void smc_lgr_terminate(struct smc_link_group *lgr)
{
	struct smc_connection *conn;
	struct rb_node *node;

	spin_lock_bh(&smc_lgr_list.lock);
	if (list_empty(&lgr->list)) {
		/* termination already triggered */
		spin_unlock_bh(&smc_lgr_list.lock);
		return;
	}
	/* do not use this link group for new connections */
	list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);

	write_lock_bh(&lgr->conns_lock);
	node = rb_first(&lgr->conns_all);
	while (node) {
		conn = rb_entry(node, struct smc_connection, alert_node);
		__smc_lgr_unregister_conn(conn);
		node = rb_first(&lgr->conns_all);
	}
	write_unlock_bh(&lgr->conns_lock);
	schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
}

/* Determine vlan of internal TCP socket.
 * @vlan_id: address to store the determined vlan id into
 */
static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	int rc = 0;

	*vlan_id = 0;
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	if (is_vlan_dev(dst->dev))
		*vlan_id = vlan_dev_vlan_id(dst->dev);

out_rel:
	dst_release(dst);
out:
	return rc;
}

/* determine the link gid matching the vlan id of the link group */
static int smc_link_determine_gid(struct smc_link_group *lgr)
{
	struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
	struct ib_gid_attr gattr;
	union ib_gid gid;
	int i;

	if (!lgr->vlan_id) {
		lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
		return 0;
	}

	for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
	     i++) {
		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
				 &gattr))
			continue;
		if (gattr.ndev &&
		    (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
			lnk->gid = gid;
			return 0;
		}
	}
	return -ENODEV;
}

/* create a new SMC connection (and a new link group if necessary) */
int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
		    struct smc_ib_device *smcibdev, u8 ibport,
		    struct smc_clc_msg_local *lcl, int srv_first_contact)
{
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr;
	unsigned short vlan_id;
	enum smc_lgr_role role;
	int local_contact = SMC_FIRST_CONTACT;
	int rc = 0;

	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
	rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
	if (rc)
		return rc;

	if ((role == SMC_CLNT) && srv_first_contact)
		/* create new link group as well */
		goto create;

	/* determine if an existing link group can be reused */
	spin_lock_bh(&smc_lgr_list.lock);
	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
		write_lock_bh(&lgr->conns_lock);
		if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
			    SMC_SYSTEMID_LEN) &&
		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
			    SMC_GID_SIZE) &&
		    !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
			    sizeof(lcl->mac)) &&
		    !lgr->sync_err &&
		    (lgr->role == role) &&
		    (lgr->vlan_id == vlan_id) &&
		    ((role == SMC_CLNT) ||
		     (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
			/* link group found */
			local_contact = SMC_REUSE_CONTACT;
			conn->lgr = lgr;
			smc_lgr_register_conn(conn); /* add smc conn to lgr */
			write_unlock_bh(&lgr->conns_lock);
			break;
		}
		write_unlock_bh(&lgr->conns_lock);
	}
	spin_unlock_bh(&smc_lgr_list.lock);

	if (role == SMC_CLNT && !srv_first_contact &&
	    (local_contact == SMC_FIRST_CONTACT)) {
		/* Server reuses a link group, but Client wants to start
		 * a new one
		 * send out_of_sync decline, reason synchr. error
		 */
		return -ENOLINK;
	}

create:
	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
				    lcl->id_for_peer, vlan_id);
		if (rc)
			goto out;
		smc_lgr_register_conn(conn); /* add smc conn to lgr */
		rc = smc_link_determine_gid(conn->lgr);
	}

out:
	return rc ? rc : local_contact;
}

/* try to reuse a sndbuf description slot of the sndbufs list for a certain
 * buf_size; if not available, return NULL
 */
static inline
struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
					 int compressed_bufsize)
{
	struct smc_buf_desc *sndbuf_slot;

	read_lock_bh(&lgr->sndbufs_lock);
	list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
			    list) {
		if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
			read_unlock_bh(&lgr->sndbufs_lock);
			return sndbuf_slot;
		}
	}
	read_unlock_bh(&lgr->sndbufs_lock);
	return NULL;
}

/* try to reuse an rmb description slot of the rmbs list for a certain
 * rmbe_size; if not available, return NULL
 */
static inline
struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
				      int compressed_bufsize)
{
	struct smc_buf_desc *rmb_slot;

	read_lock_bh(&lgr->rmbs_lock);
	list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
			    list) {
		if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
			read_unlock_bh(&lgr->rmbs_lock);
			return rmb_slot;
		}
	}
	read_unlock_bh(&lgr->rmbs_lock);
	return NULL;
}

/* create the tx buffer for an SMC socket */
int smc_sndbuf_create(struct smc_sock *smc)
{
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	int tmp_bufsize, tmp_bufsize_short;
	struct smc_buf_desc *sndbuf_desc;
	int rc;

	/* use socket send buffer size (w/o overhead) as start value */
	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
		/* check for reusable sndbuf_slot in the link group */
		sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
		if (sndbuf_desc) {
			memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
			break; /* found reusable slot */
		}
		/* try to alloc a new send buffer */
		sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
		if (!sndbuf_desc)
			break; /* give up with -ENOMEM */
		sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
						GFP_KERNEL | __GFP_NOWARN |
						__GFP_NOMEMALLOC |
						__GFP_NORETRY);
		if (!sndbuf_desc->cpu_addr) {
			kfree(sndbuf_desc);
			/* if send buffer allocation has failed,
			 * try a smaller one
			 */
			continue;
		}
		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				    tmp_bufsize, sndbuf_desc,
				    DMA_TO_DEVICE);
		if (rc) {
			kfree(sndbuf_desc->cpu_addr);
			kfree(sndbuf_desc);
			continue; /* if mapping failed, try smaller one */
		}
		sndbuf_desc->used = 1;
		write_lock_bh(&lgr->sndbufs_lock);
		list_add(&sndbuf_desc->list,
			 &lgr->sndbufs[tmp_bufsize_short]);
		write_unlock_bh(&lgr->sndbufs_lock);
		break;
	}
	if (sndbuf_desc && sndbuf_desc->cpu_addr) {
		conn->sndbuf_desc = sndbuf_desc;
		conn->sndbuf_size = tmp_bufsize;
		smc->sk.sk_sndbuf = tmp_bufsize * 2;
		return 0;
	} else {
		return -ENOMEM;
	}
}

/* create the RMB for an SMC socket (even though the SMC protocol
 * allows more than one RMB-element per RMB, the Linux implementation
 * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
 * connection in a link group)
 */
int smc_rmb_create(struct smc_sock *smc)
{
	struct smc_connection *conn = &smc->conn;
	struct smc_link_group *lgr = conn->lgr;
	int tmp_bufsize, tmp_bufsize_short;
	struct smc_buf_desc *rmb_desc;
	int rc;

	/* use socket recv buffer size (w/o overhead) as start value */
	for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
	     tmp_bufsize_short >= 0; tmp_bufsize_short--) {
		tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
		/* check for reusable rmb_slot in the link group */
		rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
		if (rmb_desc) {
			memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
			break; /* found reusable slot */
		}
		/* try to alloc a new RMB */
		rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
		if (!rmb_desc)
			break; /* give up with -ENOMEM */
		rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
					     GFP_KERNEL | __GFP_NOWARN |
					     __GFP_NOMEMALLOC |
					     __GFP_NORETRY);
		if (!rmb_desc->cpu_addr) {
			kfree(rmb_desc);
			/* if RMB allocation has failed,
			 * try a smaller one
			 */
			continue;
		}
		rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
				    tmp_bufsize, rmb_desc,
				    DMA_FROM_DEVICE);
		if (rc) {
			kfree(rmb_desc->cpu_addr);
			kfree(rmb_desc);
			continue; /* if mapping failed, try smaller one */
		}
		rmb_desc->used = 1;
		write_lock_bh(&lgr->rmbs_lock);
		list_add(&rmb_desc->list,
			 &lgr->rmbs[tmp_bufsize_short]);
		write_unlock_bh(&lgr->rmbs_lock);
		break;
	}
	if (rmb_desc && rmb_desc->cpu_addr) {
		conn->rmb_desc = rmb_desc;
		conn->rmbe_size = tmp_bufsize;
		conn->rmbe_size_short = tmp_bufsize_short;
		smc->sk.sk_rcvbuf = tmp_bufsize * 2;
		return 0;
	} else {
		return -ENOMEM;
	}
}