Merge branch 'tcp-preempt'
Eric Dumazet says:
====================
net: make TCP preemptible
Most of TCP stack assumed it was running from BH handler.
This is great for most things, as TCP behavior is very sensitive
to scheduling artifacts.
However, the prequeue and backlog processing are problematic,
as they need to be flushed with BH being blocked.
To cope with modern needs, TCP sockets have big sk_rcvbuf values,
in the order of 16 MB, and soon 32 MB.
This means that backlog can hold thousands of packets, and things
like TCP coalescing or collapsing on this amount of packets can
lead to insane latency spikes, since BH are blocked for too long.
It is time to make UDP/TCP stacks preemptible.
Note that fast path still runs from BH handler.
v2: Added "tcp: make tcp_sendmsg() aware of socket backlog"
to reduce latency problems of large sends.
v3: Fixed a typo in tcp_cdg.c
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/sock.h b/include/net/sock.h
index 3df778c..1dbb1f9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -926,6 +926,17 @@
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);
+void __sk_flush_backlog(struct sock *sk);
+
+static inline bool sk_flush_backlog(struct sock *sk)
+{
+ if (unlikely(READ_ONCE(sk->sk_backlog.tail))) {
+ __sk_flush_backlog(sk);
+ return true;
+ }
+ return false;
+}
+
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
struct request_sock_ops;
diff --git a/net/core/sock.c b/net/core/sock.c
index e16a5db..f615e93 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2019,33 +2019,27 @@
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
- struct sk_buff *skb = sk->sk_backlog.head;
+ struct sk_buff *skb, *next;
- do {
+ while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
- bh_unlock_sock(sk);
+
+ spin_unlock_bh(&sk->sk_lock.slock);
do {
- struct sk_buff *next = skb->next;
-
+ next = skb->next;
prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
- /*
- * We are in process context here with softirqs
- * disabled, use cond_resched_softirq() to preempt.
- * This is safe to do because we've taken the backlog
- * queue private:
- */
- cond_resched_softirq();
+ cond_resched();
skb = next;
} while (skb != NULL);
- bh_lock_sock(sk);
- } while ((skb = sk->sk_backlog.head) != NULL);
+ spin_lock_bh(&sk->sk_lock.slock);
+ }
/*
* Doing the zeroing here guarantee we can not loop forever
@@ -2054,6 +2048,13 @@
sk->sk_backlog.len = 0;
}
+void __sk_flush_backlog(struct sock *sk)
+{
+ spin_lock_bh(&sk->sk_lock.slock);
+ __release_sock(sk);
+ spin_unlock_bh(&sk->sk_lock.slock);
+}
+
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
diff --git a/net/dccp/input.c b/net/dccp/input.c
index 2437ecc..ba34718 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -359,7 +359,7 @@
goto discard;
}
- __DCCP_INC_STATS(DCCP_MIB_INERRS);
+ DCCP_INC_STATS(DCCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index a816427..5c7e413 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -533,8 +533,8 @@
bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
- __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
}
out:
dst_release(dst);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 0f4eb4e..d176f4e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -277,8 +277,8 @@
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
- __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
+ DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
}
diff --git a/net/dccp/options.c b/net/dccp/options.c
index b82b7ee..74d29c5 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -253,7 +253,7 @@
return 0;
out_invalid_option:
- __DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
+ DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
rc = DCCP_RESET_CODE_OPTION_ERROR;
out_featneg_failed:
DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index cb4d1ca..b945c2b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1136,11 +1136,12 @@
/* This should be in poll */
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- mss_now = tcp_send_mss(sk, &size_goal, flags);
-
/* Ok commence sending. */
copied = 0;
+restart:
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
@@ -1166,6 +1167,9 @@
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
+ if (sk_flush_backlog(sk))
+ goto restart;
+
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation,
@@ -1449,12 +1453,8 @@
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
- /* RX process wants to run with disabled BHs, though it is not
- * necessary */
- local_bh_disable();
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
- local_bh_enable();
/* Clear memory counter. */
tp->ucopy.memory = 0;
@@ -3095,7 +3095,7 @@
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 3c00208..ccce8a5 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -155,11 +155,11 @@
ca->last_ack = now_us;
if (after(now_us, ca->round_start + base_owd)) {
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINDETECT);
- __NET_ADD_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
return;
}
@@ -174,11 +174,11 @@
125U);
if (ca->rtt.min > thresh) {
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYDETECT);
- __NET_ADD_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 59155af..0ce946e 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -402,11 +402,11 @@
ca->last_ack = now;
if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
ca->found |= HYSTART_ACK_TRAIN;
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINDETECT);
- __NET_ADD_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -423,11 +423,11 @@
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
ca->found |= HYSTART_DELAY;
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYDETECT);
- __NET_ADD_STATS(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index a1498d5..54d9f9b 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -255,9 +255,9 @@
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
- spin_unlock(&fastopenq->lock);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
+ spin_unlock(&fastopenq->lock);
return false;
}
fastopenq->rskq_rst_head = req1->dl_next;
@@ -282,7 +282,7 @@
struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
@@ -311,13 +311,13 @@
child = tcp_fastopen_create_child(sk, skb, dst, req);
if (child) {
foc->len = -1;
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVE);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
return child;
}
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
valid_foc.exp = foc->exp;
*foc = valid_foc;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1fb19c9..6171f92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -869,7 +869,7 @@
else
mib_idx = LINUX_MIB_TCPSACKREORDER;
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -1062,7 +1062,7 @@
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
dup_sack = true;
tcp_dsack_seen(tp);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
@@ -1071,7 +1071,7 @@
!before(start_seq_0, start_seq_1)) {
dup_sack = true;
tcp_dsack_seen(tp);
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
}
}
@@ -1289,7 +1289,7 @@
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
@@ -1314,7 +1314,7 @@
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
return true;
}
@@ -1473,7 +1473,7 @@
return skb;
fallback:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
return NULL;
}
@@ -1661,7 +1661,7 @@
mib_idx = LINUX_MIB_TCPSACKDISCARD;
}
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
continue;
@@ -1913,7 +1913,7 @@
skb = tcp_write_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -2399,7 +2399,7 @@
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
@@ -2421,7 +2421,7 @@
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
return true;
}
return false;
@@ -2436,9 +2436,9 @@
tcp_undo_cwnd_reduction(sk, true);
DBGUNDO(sk, "partial loss");
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
if (frto_undo || tcp_is_sack(tp))
@@ -2563,7 +2563,7 @@
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2583,7 +2583,7 @@
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2647,7 +2647,7 @@
else
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
@@ -2740,7 +2740,7 @@
DBGUNDO(sk, "partial recovery");
tcp_undo_cwnd_reduction(sk, true);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
return true;
}
@@ -3434,7 +3434,7 @@
s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
- __NET_INC_STATS(net, mib_idx);
+ NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
}
@@ -3467,7 +3467,7 @@
challenge_count = 0;
}
if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}
@@ -3516,7 +3516,7 @@
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPLOSSPROBERECOVERY);
} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
@@ -3621,14 +3621,14 @@
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
@@ -4131,7 +4131,7 @@
else
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
@@ -4155,7 +4155,7 @@
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
@@ -4305,7 +4305,7 @@
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
@@ -4393,7 +4393,7 @@
tcp_ecn_check_ce(tp, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
tcp_drop(sk, skb);
return;
}
@@ -4402,7 +4402,7 @@
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4457,7 +4457,7 @@
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
@@ -4496,7 +4496,7 @@
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
tcp_drop(sk, skb1);
}
@@ -4611,14 +4611,12 @@
__set_current_state(TASK_RUNNING);
- local_bh_enable();
if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
}
if (eaten <= 0) {
@@ -4661,7 +4659,7 @@
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
@@ -4707,7 +4705,7 @@
__skb_unlink(skb, list);
__kfree_skb(skb);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
@@ -4866,7 +4864,7 @@
bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
@@ -4895,7 +4893,7 @@
SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
@@ -4925,7 +4923,7 @@
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
*/
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
@@ -5134,7 +5132,6 @@
int chunk = skb->len - hlen;
int err;
- local_bh_enable();
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
else
@@ -5146,32 +5143,9 @@
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
return err;
}
-static __sum16 __tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- __sum16 result;
-
- if (sock_owned_by_user(sk)) {
- local_bh_enable();
- result = __tcp_checksum_complete(skb);
- local_bh_disable();
- } else {
- result = __tcp_checksum_complete(skb);
- }
- return result;
-}
-
-static inline bool tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- return !skb_csum_unnecessary(skb) &&
- __tcp_checksum_complete_user(sk, skb);
-}
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5184,7 +5158,7 @@
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
@@ -5236,8 +5210,8 @@
if (th->syn) {
syn_challenge:
if (syn_inerr)
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
goto discard;
}
@@ -5352,7 +5326,7 @@
tcp_data_snd_check(sk);
return;
} else { /* Header too small */
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
@@ -5380,13 +5354,13 @@
__skb_pull(skb, tcp_header_len);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
}
if (!eaten) {
- if (tcp_checksum_complete_user(sk, skb))
+ if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
@@ -5403,7 +5377,7 @@
tcp_rcv_rtt_measure_ts(sk, skb);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
@@ -5430,7 +5404,7 @@
}
slow_path:
- if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+ if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
@@ -5460,8 +5434,8 @@
return;
csum_error:
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
tcp_drop(sk, skb);
@@ -5553,13 +5527,13 @@
break;
}
tcp_rearm_rto(sk);
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked)
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVE);
tcp_fastopen_add_skb(sk, synack);
@@ -5595,7 +5569,7 @@
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
tcp_time_stamp)) {
- __NET_INC_STATS(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
}
@@ -5965,7 +5939,7 @@
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
@@ -6022,7 +5996,7 @@
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
@@ -6224,7 +6198,7 @@
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6271,7 +6245,7 @@
if (dst && strict &&
!tcp_peer_is_proven(req, dst, true,
tmp_opt.saw_tstamp)) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 87b173b..761bc49 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -692,6 +692,7 @@
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ preempt_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -699,6 +700,7 @@
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ preempt_enable();
#ifdef CONFIG_TCP_MD5SIG
out:
@@ -774,12 +776,14 @@
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ preempt_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ preempt_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -1151,12 +1155,12 @@
return false;
if (hash_expected && !hash_location) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
@@ -1342,7 +1346,7 @@
return newsk;
exit_overflow:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
@@ -1432,8 +1436,8 @@
return 0;
csum_err:
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ffbfecd..4b95ec4 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -337,7 +337,7 @@
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1a487ff9..25d5279 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2221,14 +2221,13 @@
/* Thanks to skb fast clones, we can detect if a prior transmit of
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
- * Note: This is called from BH context only.
*/
static bool skb_still_in_host_queue(const struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
}
return false;
@@ -2290,7 +2289,7 @@
tp->tlp_high_seq = tp->snd_nxt;
probe_sent:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
rearm_timer:
@@ -2699,7 +2698,7 @@
tp->retrans_stamp = tcp_skb_timestamp(skb);
} else if (err != -EBUSY) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2823,7 +2822,7 @@
if (tcp_retransmit_skb(sk, skb, segs))
return;
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index e0d0afa..e36df4f 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -65,8 +65,8 @@
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
}
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 35f643d..debdd8b 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -162,8 +162,8 @@
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (tp->syn_data && icsk->icsk_retransmits == 1)
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
@@ -178,8 +178,8 @@
tp->bytes_acked <= tp->rx_opt.mss_clamp) {
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
- __NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -209,6 +209,7 @@
return 0;
}
+/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -493,6 +494,7 @@
out:;
}
+/* Called with BH disabled */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 093284c..f67f52b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1514,9 +1514,9 @@
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5291471..7bdc9c9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -649,12 +649,12 @@
return false;
if (hash_expected && !hash_location) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
@@ -825,9 +825,9 @@
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
- __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
- __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
return;
}
@@ -1276,8 +1276,8 @@
kfree_skb(skb);
return 0;
csum_err:
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1ba5a74..f911c63 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -570,9 +570,9 @@
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
- __UDP6_INC_STATS(sock_net(sk),
+ UDP6_INC_STATS(sock_net(sk),
UDP_MIB_RCVBUFERRORS, is_udplite);
- __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
return -1;
}
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index b335ffc..9d87bba 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -89,10 +89,12 @@
* Eventually, we should clean up inqueue to not rely
* on the BH related data structures.
*/
+ local_bh_disable();
list_add_tail(&chunk->list, &q->in_chunk_list);
if (chunk->asoc)
chunk->asoc->stats.ipackets++;
q->immediate.func(&q->immediate);
+ local_bh_enable();
}
/* Peek at the next chunk on the inqeue. */