Merge branch 'listener_refactor_part_11'
Eric Dumazet says:
====================
inet: tcp listener refactoring, part 11
Before inserting request sockets into general (ehash) table,
we need to prepare netfilter to cope with them, as they are
not full sockets.
I'll later change xt_socket to get full support, including for
request sockets (NEW_SYN_RECV)
Save 8 bytes in inet_request_sock on 64bit arches. We'll soon add
a pointer to the listener socket.
I included two TCP changes in this patch series.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 3d8c09a..c9ed918 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -94,11 +94,11 @@
acked : 1,
no_srccheck: 1;
kmemcheck_bitfield_end(flags);
+ u32 ir_mark;
union {
struct ip_options_rcu *opt;
struct sk_buff *pktopts;
};
- u32 ir_mark;
};
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
@@ -106,13 +106,12 @@
return (struct inet_request_sock *)sk;
}
-static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb)
+static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
{
- if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) {
+ if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
return skb->mark;
- } else {
- return sk->sk_mark;
- }
+
+ return sk->sk_mark;
}
struct inet_cork {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2e11e38..5b29835 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1137,31 +1137,6 @@
return tcp_win_from_space(sk->sk_rcvbuf);
}
-static inline void tcp_openreq_init(struct request_sock *req,
- struct tcp_options_received *rx_opt,
- struct sk_buff *skb, struct sock *sk)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
-
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
- req->cookie_ts = 0;
- tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
- tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tcp_rsk(req)->snt_synack = tcp_time_stamp;
- tcp_rsk(req)->last_oow_ack_time = 0;
- req->mss = rx_opt->mss_clamp;
- req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
- ireq->tstamp_ok = rx_opt->tstamp_ok;
- ireq->sack_ok = rx_opt->sack_ok;
- ireq->snd_wscale = rx_opt->snd_wscale;
- ireq->wscale_ok = rx_opt->wscale_ok;
- ireq->acked = 0;
- ireq->ecn_ok = 0;
- ireq->ir_rmt_port = tcp_hdr(skb)->source;
- ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
- ireq->ir_mark = inet_request_mark(sk, skb);
-}
-
extern void tcp_openreq_init_rwin(struct request_sock *req,
struct sock *sk, struct dst_entry *dst);
@@ -1241,36 +1216,8 @@
return true;
}
-/* Return true if we're currently rate-limiting out-of-window ACKs and
- * thus shouldn't send a dupack right now. We rate-limit dupacks in
- * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
- * attacks that send repeated SYNs or ACKs for the same connection. To
- * do this, we do not send a duplicate SYNACK or ACK if the remote
- * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
- */
-static inline bool tcp_oow_rate_limited(struct net *net,
- const struct sk_buff *skb,
- int mib_idx, u32 *last_oow_ack_time)
-{
- /* Data packets without SYNs are not likely part of an ACK loop. */
- if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
- !tcp_hdr(skb)->syn)
- goto not_rate_limited;
-
- if (*last_oow_ack_time) {
- s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
-
- if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
- NET_INC_STATS_BH(net, mib_idx);
- return true; /* rate-limited: don't send yet! */
- }
- }
-
- *last_oow_ack_time = tcp_time_stamp;
-
-not_rate_limited:
- return false; /* not rate-limited: go ahead, send dupack now! */
-}
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time);
static inline void tcp_mib_init(struct net *net)
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 717d437..7257eb2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3321,6 +3321,36 @@
return flag;
}
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
/* RFC 5961 7 [ACK Throttling] */
static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
{
@@ -5912,6 +5942,31 @@
inet_rsk(req)->ecn_ok = 1;
}
+static void tcp_openreq_init(struct request_sock *req,
+ const struct tcp_options_received *rx_opt,
+ struct sk_buff *skb, const struct sock *sk)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->cookie_ts = 0;
+ tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
+ tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
+ req->mss = rx_opt->mss_clamp;
+ req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
+ ireq->tstamp_ok = rx_opt->tstamp_ok;
+ ireq->sack_ok = rx_opt->sack_ok;
+ ireq->snd_wscale = rx_opt->snd_wscale;
+ ireq->wscale_ok = rx_opt->wscale_ok;
+ ireq->acked = 0;
+ ireq->ecn_ok = 0;
+ ireq->ir_rmt_port = tcp_hdr(skb)->source;
+ ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+ ireq->ir_mark = inet_request_mark(sk, skb);
+}
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3aedbda..f35c15b 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -209,7 +209,7 @@
struct sock *sk = skb->sk;
struct rtable *ort = skb_rtable(skb);
- if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ if (!skb->dev && sk && sk_fullsock(sk))
ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
index a2233e7..2631876 100644
--- a/net/netfilter/nf_log_common.c
+++ b/net/netfilter/nf_log_common.c
@@ -133,7 +133,7 @@
void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
{
- if (!sk || sk->sk_state == TCP_TIME_WAIT)
+ if (!sk || !sk_fullsock(sk))
return;
read_lock_bh(&sk->sk_callback_lock);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 11d85b3..61d04bf 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -539,7 +539,7 @@
/* UID */
sk = skb->sk;
- if (sk && sk->sk_state != TCP_TIME_WAIT) {
+ if (sk && sk_fullsock(sk)) {
read_lock_bh(&sk->sk_callback_lock);
if (sk->sk_socket && sk->sk_socket->file) {
struct file *file = sk->sk_socket->file;
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 0db8515..86ee8b0 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -257,7 +257,7 @@
{
const struct cred *cred;
- if (sk->sk_state == TCP_TIME_WAIT)
+ if (!sk_fullsock(sk))
return 0;
read_lock_bh(&sk->sk_callback_lock);
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index e99911e..abe6811 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -83,7 +83,7 @@
*(u16 *)dest->data = out->type;
break;
case NFT_META_SKUID:
- if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+ if (skb->sk == NULL || !sk_fullsock(skb->sk))
goto err;
read_lock_bh(&skb->sk->sk_callback_lock);
@@ -99,7 +99,7 @@
read_unlock_bh(&skb->sk->sk_callback_lock);
break;
case NFT_META_SKGID:
- if (skb->sk == NULL || skb->sk->sk_state == TCP_TIME_WAIT)
+ if (skb->sk == NULL || !sk_fullsock(skb->sk))
goto err;
read_lock_bh(&skb->sk->sk_callback_lock);
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index ef8a926..165b77c 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -42,15 +42,21 @@
static bool tproxy_sk_is_transparent(struct sock *sk)
{
- if (sk->sk_state != TCP_TIME_WAIT) {
- if (inet_sk(sk)->transparent)
- return true;
- sock_put(sk);
- } else {
+ switch (sk->sk_state) {
+ case TCP_TIME_WAIT:
if (inet_twsk(sk)->tw_transparent)
return true;
- inet_twsk_put(inet_twsk(sk));
+ break;
+ case TCP_NEW_SYN_RECV:
+ if (inet_rsk(inet_reqsk(sk))->no_srccheck)
+ return true;
+ break;
+ default:
+ if (inet_sk(sk)->transparent)
+ return true;
}
+
+ sock_gen_put(sk);
return false;
}
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 13332dbf..895534e 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -129,6 +129,20 @@
return NULL;
}
+static bool xt_socket_sk_is_transparent(struct sock *sk)
+{
+ switch (sk->sk_state) {
+ case TCP_TIME_WAIT:
+ return inet_twsk(sk)->tw_transparent;
+
+ case TCP_NEW_SYN_RECV:
+ return inet_rsk(inet_reqsk(sk))->no_srccheck;
+
+ default:
+ return inet_sk(sk)->transparent;
+ }
+}
+
static bool
socket_match(const struct sk_buff *skb, struct xt_action_param *par,
const struct xt_socket_mtinfo1 *info)
@@ -195,16 +209,14 @@
* unless XT_SOCKET_NOWILDCARD is set
*/
wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
- sk->sk_state != TCP_TIME_WAIT &&
+ sk_fullsock(sk) &&
inet_sk(sk)->inet_rcv_saddr == 0);
/* Ignore non-transparent sockets,
- if XT_SOCKET_TRANSPARENT is used */
+ * if XT_SOCKET_TRANSPARENT is used
+ */
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = ((sk->sk_state != TCP_TIME_WAIT &&
- inet_sk(sk)->transparent) ||
- (sk->sk_state == TCP_TIME_WAIT &&
- inet_twsk(sk)->tw_transparent));
+ transparent = xt_socket_sk_is_transparent(sk);
if (sk != skb->sk)
sock_gen_put(sk);
@@ -363,16 +375,14 @@
* unless XT_SOCKET_NOWILDCARD is set
*/
wildcard = (!(info->flags & XT_SOCKET_NOWILDCARD) &&
- sk->sk_state != TCP_TIME_WAIT &&
+ sk_fullsock(sk) &&
ipv6_addr_any(&sk->sk_v6_rcv_saddr));
/* Ignore non-transparent sockets,
- if XT_SOCKET_TRANSPARENT is used */
+ * if XT_SOCKET_TRANSPARENT is used
+ */
if (info->flags & XT_SOCKET_TRANSPARENT)
- transparent = ((sk->sk_state != TCP_TIME_WAIT &&
- inet_sk(sk)->transparent) ||
- (sk->sk_state == TCP_TIME_WAIT &&
- inet_twsk(sk)->tw_transparent));
+ transparent = xt_socket_sk_is_transparent(sk);
if (sk != skb->sk)
sock_gen_put(sk);