RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net
Open the sockets by calling sock_create_kern() with the correct struct net
pointer, and use that struct net pointer when verifying the
address passed to rds_bind().
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@
ret = 0;
goto out;
}
- trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+ trans = rds_trans_get_preferred(sock_net(sock->sk),
+ sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..d4fecb2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@
* For now they are not garbage collected once they're created. They
* are torn down as the module is removed, if ever.
*/
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
@@ -157,6 +158,7 @@
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
+ rds_conn_net_set(conn, net);
init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue);
@@ -174,7 +176,7 @@
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(faddr);
+ loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -260,17 +262,19 @@
return conn;
}
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
- return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
- return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..f40d8f5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,9 @@
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
- conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
- GFP_KERNEL);
+ /* RDS/IB is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
+ &rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 5899356..5d5a9d2 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -218,7 +218,7 @@
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_iw_laddr_check(__be32 addr)
+static int rds_iw_laddr_check(struct net *net, __be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index 8f486fa..a6553a6 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -398,8 +398,9 @@
&dp->dp_saddr, &dp->dp_daddr,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
- conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
- GFP_KERNEL);
+ /* RDS/IW is not currently netns aware, thus init_net */
+ conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
+ &rds_iw_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 2260c1e4..9005fb0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -128,8 +128,21 @@
/* Protocol version */
unsigned int c_version;
+ possible_net_t c_net;
};
+static inline
+struct net *rds_conn_net(struct rds_connection *conn)
+{
+ return read_pnet(&conn->c_net);
+}
+
+static inline
+void rds_conn_net_set(struct rds_connection *conn, struct net *net)
+{
+ write_pnet(&conn->c_net, net);
+}
+
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
@@ -417,7 +430,7 @@
unsigned int t_prefer_loopback:1;
unsigned int t_type;
- int (*laddr_check)(__be32 addr);
+ int (*laddr_check)(struct net *net, __be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_connect)(struct rds_connection *conn);
@@ -608,9 +621,11 @@
/* conn.c */
int rds_conn_init(void);
void rds_conn_exit(void);
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+ __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
@@ -795,7 +810,7 @@
/* transport.c */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(__be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
diff --git a/net/rds/send.c b/net/rds/send.c
index e9430f5..2581b8e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1023,7 +1023,8 @@
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
conn = rs->rs_conn;
else {
- conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
+ conn = rds_conn_create_outgoing(sock_net(sock->sk),
+ rs->rs_bound_addr, daddr,
rs->rs_transport,
sock->sk->sk_allocation);
if (IS_ERR(conn)) {
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index edac9ef..98f5de3 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -189,9 +189,9 @@
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
-static int rds_tcp_laddr_check(__be32 addr)
+static int rds_tcp_laddr_check(struct net *net, __be32 addr)
{
- if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
+ if (inet_addr_type(net, addr) == RTN_LOCAL)
return 0;
return -EADDRNOTAVAIL;
}
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 973109c7..6473b7b 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -79,7 +79,8 @@
struct sockaddr_in src, dest;
int ret;
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 0da49e3..398ffe5 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -85,8 +85,9 @@
struct inet_sock *inet;
struct rds_tcp_connection *rs_tcp;
- ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
- sock->sk->sk_protocol, &new_sock);
+ ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
+ sock->sk->sk_type, sock->sk->sk_protocol,
+ &new_sock);
if (ret)
goto out;
@@ -108,7 +109,8 @@
&inet->inet_saddr, ntohs(inet->inet_sport),
&inet->inet_daddr, ntohs(inet->inet_dport));
- conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
+ conn = rds_conn_create(sock_net(sock->sk),
+ inet->inet_saddr, inet->inet_daddr,
&rds_tcp_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
@@ -187,7 +189,13 @@
struct socket *sock = NULL;
int ret;
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ /* MUST call sock_create_kern directly so that we avoid get_net()
+ * in sk_alloc(). Doing a get_net() will result in cleanup_net()
+ * never getting invoked, which will leave sock and other things
+ * in limbo.
+ */
+ ret = sock_create_kern(current->nsproxy->net_ns, PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 83498e1..f3afd1d 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -77,7 +77,7 @@
module_put(trans->t_owner);
}
-struct rds_transport *rds_trans_get_preferred(__be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
@@ -90,7 +90,7 @@
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
- if (trans && (trans->laddr_check(addr) == 0) &&
+ if (trans && (trans->laddr_check(net, addr) == 0) &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;