Merge branch 'tcp_dccp_ports'

Eric Dumazet says:

====================
tcp/dccp: better use of ephemeral ports

Big servers have a bloated bind table, making it very hard for
ephemeral port allocations to succeed without special
containers/namespace tricks.

This patch series extends the strategy added in commit 07f4c90062f8
("tcp/dccp: try to not exhaust ip_local_port_range in connect()").

Since ports used by connect() are very likely to be shared among them,
we give a hint to both bind() and connect() to keep the two crowds
separated if possible.

Of course, if an application on a given host needs to allocate ~30000
ports using bind(), it will still be able to do so. The same holds for
~30000 connect() calls to a single 2-tuple (dst addr, dst port).

The new implementation is also friendlier to softirqs and reschedules.

v2: rebase after TCP SO_REUSEPORT changes
====================
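
For illustration, a minimal standalone sketch of the parity split
(pick_port() is a made-up name for this example; the real logic lives
in inet_csk_get_port() and __inet_hash_connect() in the diffs below):
both sides draw a random offset into the ephemeral range, but bind()
forces the offset odd while connect() forces it even, so their scans,
which step by 2, never walk the same candidates.

	/* Illustrative sketch only, not part of the patch. */
	#include <stdio.h>
	#include <stdlib.h>

	static int pick_port(int low, int high, int for_bind)
	{
		unsigned int remaining = high - low; /* half-open [low, high) */
		unsigned int offset;

		if (remaining > 1)
			remaining &= ~1U;	/* even candidate count */

		offset = rand() % remaining;
		if (for_bind)
			offset |= 1U;		/* bind() scans odd offsets... */
		else
			offset &= ~1U;		/* ...connect() scans even ones */

		/* First candidate; the kernel then steps by 2 through
		 * the range, so the two parities never collide.
		 */
		return low + offset;
	}

	int main(void)
	{
		printf("bind() start:    %d\n", pick_port(32768, 61000, 1));
		printf("connect() start: %d\n", pick_port(32768, 61000, 0));
		return 0;
	}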

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index c16a2e6..3d28c6d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -91,165 +91,153 @@
 
 /* Obtain a reference to a local port for the given sock,
  * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
  */
 int inet_csk_get_port(struct sock *sk, unsigned short snum)
 {
-	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+	int ret = 1, attempts = 5, port = snum;
+	int smallest_size = -1, smallest_port;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret, attempts = 5;
 	struct net *net = sock_net(sk);
-	int smallest_size = -1, smallest_rover;
+	int i, low, high, attempt_half;
+	struct inet_bind_bucket *tb;
 	kuid_t uid = sock_i_uid(sk);
-	int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+	u32 remaining, offset;
 
-	local_bh_disable();
-	if (!snum) {
-		int remaining, rover, low, high;
-
-again:
-		inet_get_local_port_range(net, &low, &high);
-		if (attempt_half) {
-			int half = low + ((high - low) >> 1);
-
-			if (attempt_half == 1)
-				high = half;
-			else
-				low = half;
-		}
-		remaining = (high - low) + 1;
-		smallest_rover = rover = prandom_u32() % remaining + low;
-
-		smallest_size = -1;
-		do {
-			if (inet_is_local_reserved_port(net, rover))
-				goto next_nolock;
-			head = &hashinfo->bhash[inet_bhashfn(net, rover,
-					hashinfo->bhash_size)];
-			spin_lock(&head->lock);
-			inet_bind_bucket_for_each(tb, &head->chain)
-				if (net_eq(ib_net(tb), net) && tb->port == rover) {
-					if (((tb->fastreuse > 0 &&
-					      sk->sk_reuse &&
-					      sk->sk_state != TCP_LISTEN) ||
-					     (tb->fastreuseport > 0 &&
-					      sk->sk_reuseport &&
-					      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-					      uid_eq(tb->fastuid, uid))) &&
-					    (tb->num_owners < smallest_size || smallest_size == -1)) {
-						smallest_size = tb->num_owners;
-						smallest_rover = rover;
-					}
-					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
-						snum = rover;
-						goto tb_found;
-					}
-					goto next;
-				}
-			break;
-		next:
-			spin_unlock(&head->lock);
-		next_nolock:
-			if (++rover > high)
-				rover = low;
-		} while (--remaining > 0);
-
-		/* Exhausted local port range during search?  It is not
-		 * possible for us to be holding one of the bind hash
-		 * locks if this test triggers, because if 'remaining'
-		 * drops to zero, we broke out of the do/while loop at
-		 * the top level, not from the 'break;' statement.
-		 */
-		ret = 1;
-		if (remaining <= 0) {
-			if (smallest_size != -1) {
-				snum = smallest_rover;
-				goto have_snum;
-			}
-			if (attempt_half == 1) {
-				/* OK we now try the upper half of the range */
-				attempt_half = 2;
-				goto again;
-			}
-			goto fail;
-		}
-		/* OK, here is the one we will use.  HEAD is
-		 * non-NULL and we hold it's mutex.
-		 */
-		snum = rover;
-	} else {
-have_snum:
-		head = &hashinfo->bhash[inet_bhashfn(net, snum,
-				hashinfo->bhash_size)];
-		spin_lock(&head->lock);
+	if (port) {
+have_port:
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
 		inet_bind_bucket_for_each(tb, &head->chain)
-			if (net_eq(ib_net(tb), net) && tb->port == snum)
+			if (net_eq(ib_net(tb), net) && tb->port == port)
 				goto tb_found;
+
+		goto tb_not_found;
 	}
-	tb = NULL;
-	goto tb_not_found;
+again:
+	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	if (high - low < 4)
+		attempt_half = 0;
+	if (attempt_half) {
+		int half = low + (((high - low) >> 2) << 1);
+
+		if (attempt_half == 1)
+			high = half;
+		else
+			low = half;
+	}
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = prandom_u32() % remaining;
+	/* __inet_hash_connect() favors ports having @low parity
+	 * We do the opposite to not pollute connect() users.
+	 */
+	offset |= 1U;
+	smallest_size = -1;
+	smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+		inet_bind_bucket_for_each(tb, &head->chain)
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
+				if (((tb->fastreuse > 0 && reuse) ||
+				     (tb->fastreuseport > 0 &&
+				      sk->sk_reuseport &&
+				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+				      uid_eq(tb->fastuid, uid))) &&
+				    (tb->num_owners < smallest_size || smallest_size == -1)) {
+					smallest_size = tb->num_owners;
+					smallest_port = port;
+				}
+				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+					goto tb_found;
+				goto next_port;
+			}
+		goto tb_not_found;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
+	}
+
+	if (smallest_size != -1) {
+		port = smallest_port;
+		goto have_port;
+	}
+	offset--;
+	if (!(offset & 1))
+		goto other_parity_scan;
+
+	if (attempt_half == 1) {
+		/* OK we now try the upper half of the range */
+		attempt_half = 2;
+		goto other_half_scan;
+	}
+	return ret;
+
+tb_not_found:
+	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+				     net, head, port);
+	if (!tb)
+		goto fail_unlock;
 tb_found:
 	if (!hlist_empty(&tb->owners)) {
 		if (sk->sk_reuse == SK_FORCE_REUSE)
 			goto success;
 
-		if (((tb->fastreuse > 0 &&
-		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+		if (((tb->fastreuse > 0 && reuse) ||
 		     (tb->fastreuseport > 0 &&
-		      sk->sk_reuseport &&
-		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-		      uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
+		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
+		    smallest_size == -1)
 			goto success;
-		} else {
-			ret = 1;
-			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
-				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
-				     (tb->fastreuseport > 0 &&
-				      sk->sk_reuseport &&
-				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
-				      uid_eq(tb->fastuid, uid))) &&
-				    smallest_size != -1 && --attempts >= 0) {
-					spin_unlock(&head->lock);
-					goto again;
-				}
-
-				goto fail_unlock;
+		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+			if ((reuse ||
+			     (tb->fastreuseport > 0 &&
+			      sk->sk_reuseport &&
+			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
+			      uid_eq(tb->fastuid, uid))) &&
+			    smallest_size != -1 && --attempts >= 0) {
+				spin_unlock_bh(&head->lock);
+				goto again;
 			}
+			goto fail_unlock;
 		}
-	}
-tb_not_found:
-	ret = 1;
-	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
-					net, head, snum)) == NULL)
-		goto fail_unlock;
-	if (hlist_empty(&tb->owners)) {
-		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
-			tb->fastreuse = 1;
-		else
+		if (!reuse)
 			tb->fastreuse = 0;
+		if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+			tb->fastreuseport = 0;
+	} else {
+		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = 1;
 			tb->fastuid = uid;
-		} else
+		} else {
 			tb->fastreuseport = 0;
-	} else {
-		if (tb->fastreuse &&
-		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
-			tb->fastreuse = 0;
-		if (tb->fastreuseport &&
-		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
-			tb->fastreuseport = 0;
+		}
 	}
 success:
 	if (!inet_csk(sk)->icsk_bind_hash)
-		inet_bind_hash(sk, tb, snum);
+		inet_bind_hash(sk, tb, port);
 	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
 	ret = 0;
 
 fail_unlock:
-	spin_unlock(&head->lock);
-fail:
-	local_bh_enable();
+	spin_unlock_bh(&head->lock);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(inet_csk_get_port);
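
The scan in inet_csk_get_port() above steps by 2 from a random odd
offset and wraps by subtracting 'remaining' once 'port' crosses 'high'.
A standalone sketch (default range hardcoded for illustration) to check
that one pass visits every odd-offset candidate exactly once:

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		/* Default range, half-open after the patch's high++ */
		int low = 32768, high = 61000;
		unsigned int remaining = (high - low) & ~1U;
		unsigned int offset = (1234567u % remaining) | 1U; /* odd */
		int port = low + offset;
		unsigned int i, visited = 0;

		for (i = 0; i < remaining; i += 2, port += 2) {
			if (port >= high)
				port -= remaining; /* wrap within [low, high) */
			assert(port >= low && port < high);
			visited++;
		}
		/* remaining/2 distinct odd-offset ports, each seen once */
		printf("visited %u candidates\n", visited);
		return 0;
	}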
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c0f9942..bc68ece 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -565,106 +565,106 @@
 			struct sock *, __u16, struct inet_timewait_sock **))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
-	const unsigned short snum = inet_sk(sk)->inet_num;
+	struct inet_timewait_sock *tw = NULL;
 	struct inet_bind_hashbucket *head;
-	struct inet_bind_bucket *tb;
-	int ret;
+	int port = inet_sk(sk)->inet_num;
 	struct net *net = sock_net(sk);
+	struct inet_bind_bucket *tb;
+	u32 remaining, offset;
+	int ret, i, low, high;
+	static u32 hint;
 
-	if (!snum) {
-		int i, remaining, low, high, port;
-		static u32 hint;
-		u32 offset = hint + port_offset;
-		struct inet_timewait_sock *tw = NULL;
-
-		inet_get_local_port_range(net, &low, &high);
-		remaining = (high - low) + 1;
-
-		/* By starting with offset being an even number,
-		 * we tend to leave about 50% of ports for other uses,
-		 * like bind(0).
-		 */
-		offset &= ~1;
-
-		local_bh_disable();
-		for (i = 0; i < remaining; i++) {
-			port = low + (i + offset) % remaining;
-			if (inet_is_local_reserved_port(net, port))
-				continue;
-			head = &hinfo->bhash[inet_bhashfn(net, port,
-					hinfo->bhash_size)];
-			spin_lock(&head->lock);
-
-			/* Does not bother with rcv_saddr checks,
-			 * because the established check is already
-			 * unique enough.
-			 */
-			inet_bind_bucket_for_each(tb, &head->chain) {
-				if (net_eq(ib_net(tb), net) &&
-				    tb->port == port) {
-					if (tb->fastreuse >= 0 ||
-					    tb->fastreuseport >= 0)
-						goto next_port;
-					WARN_ON(hlist_empty(&tb->owners));
-					if (!check_established(death_row, sk,
-								port, &tw))
-						goto ok;
-					goto next_port;
-				}
-			}
-
-			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
-					net, head, port);
-			if (!tb) {
-				spin_unlock(&head->lock);
-				break;
-			}
-			tb->fastreuse = -1;
-			tb->fastreuseport = -1;
-			goto ok;
-
-		next_port:
-			spin_unlock(&head->lock);
+	if (port) {
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		tb = inet_csk(sk)->icsk_bind_hash;
+		spin_lock_bh(&head->lock);
+		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+			inet_ehash_nolisten(sk, NULL);
+			spin_unlock_bh(&head->lock);
+			return 0;
 		}
-		local_bh_enable();
-
-		return -EADDRNOTAVAIL;
-
-ok:
-		hint += (i + 2) & ~1;
-
-		/* Head lock still held and bh's disabled */
-		inet_bind_hash(sk, tb, port);
-		if (sk_unhashed(sk)) {
-			inet_sk(sk)->inet_sport = htons(port);
-			inet_ehash_nolisten(sk, (struct sock *)tw);
-		}
-		if (tw)
-			inet_twsk_bind_unhash(tw, hinfo);
-		spin_unlock(&head->lock);
-
-		if (tw)
-			inet_twsk_deschedule_put(tw);
-
-		ret = 0;
-		goto out;
-	}
-
-	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
-	tb  = inet_csk(sk)->icsk_bind_hash;
-	spin_lock_bh(&head->lock);
-	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		inet_ehash_nolisten(sk, NULL);
-		spin_unlock_bh(&head->lock);
-		return 0;
-	} else {
 		spin_unlock(&head->lock);
 		/* No definite answer... Walk to established hash table */
-		ret = check_established(death_row, sk, snum, NULL);
-out:
+		ret = check_established(death_row, sk, port, NULL);
 		local_bh_enable();
 		return ret;
 	}
+
+	inet_get_local_port_range(net, &low, &high);
+	high++; /* [32768, 60999] -> [32768, 61000[ */
+	remaining = high - low;
+	if (likely(remaining > 1))
+		remaining &= ~1U;
+
+	offset = (hint + port_offset) % remaining;
+	/* In first pass we try ports of @low parity.
+	 * inet_csk_get_port() does the opposite choice.
+	 */
+	offset &= ~1U;
+other_parity_scan:
+	port = low + offset;
+	for (i = 0; i < remaining; i += 2, port += 2) {
+		if (unlikely(port >= high))
+			port -= remaining;
+		if (inet_is_local_reserved_port(net, port))
+			continue;
+		head = &hinfo->bhash[inet_bhashfn(net, port,
+						  hinfo->bhash_size)];
+		spin_lock_bh(&head->lock);
+
+		/* Does not bother with rcv_saddr checks, because
+		 * the established check is already unique enough.
+		 */
+		inet_bind_bucket_for_each(tb, &head->chain) {
+			if (net_eq(ib_net(tb), net) && tb->port == port) {
+				if (tb->fastreuse >= 0 ||
+				    tb->fastreuseport >= 0)
+					goto next_port;
+				WARN_ON(hlist_empty(&tb->owners));
+				if (!check_established(death_row, sk,
+						       port, &tw))
+					goto ok;
+				goto next_port;
+			}
+		}
+
+		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+					     net, head, port);
+		if (!tb) {
+			spin_unlock_bh(&head->lock);
+			return -ENOMEM;
+		}
+		tb->fastreuse = -1;
+		tb->fastreuseport = -1;
+		goto ok;
+next_port:
+		spin_unlock_bh(&head->lock);
+		cond_resched();
+	}
+
+	offset++;
+	if ((offset & 1) && remaining > 1)
+		goto other_parity_scan;
+
+	return -EADDRNOTAVAIL;
+
+ok:
+	hint += i + 2;
+
+	/* Head lock still held and bh's disabled */
+	inet_bind_hash(sk, tb, port);
+	if (sk_unhashed(sk)) {
+		inet_sk(sk)->inet_sport = htons(port);
+		inet_ehash_nolisten(sk, (struct sock *)tw);
+	}
+	if (tw)
+		inet_twsk_bind_unhash(tw, hinfo);
+	spin_unlock(&head->lock);
+	if (tw)
+		inet_twsk_deschedule_put(tw);
+	local_bh_enable();
+	return 0;
 }
 
 /*
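
Finally, a userspace sketch (illustrative only, error handling omitted)
of the two paths this series keeps on opposite offset parities: an
explicit bind(0) goes through inet_csk_get_port(), while an unbound
connect() goes through __inet_hash_connect(), so the two printed ports
will usually land on opposite parities on a kernel with this series.

	#include <arpa/inet.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		struct sockaddr_in a;
		socklen_t len = sizeof(a);
		int lsn = socket(AF_INET, SOCK_STREAM, 0);
		int b = socket(AF_INET, SOCK_STREAM, 0);
		int c = socket(AF_INET, SOCK_STREAM, 0);

		memset(&a, 0, sizeof(a));
		a.sin_family = AF_INET;
		a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		/* A listener so connect() has a live destination */
		bind(lsn, (struct sockaddr *)&a, sizeof(a));
		listen(lsn, 1);
		getsockname(lsn, (struct sockaddr *)&a, &len);

		/* Unbound connect(): port picked by __inet_hash_connect() */
		connect(c, (struct sockaddr *)&a, sizeof(a));
		getsockname(c, (struct sockaddr *)&a, &len);
		printf("connect() port: %d\n", ntohs(a.sin_port));

		/* Explicit bind(0): port picked by inet_csk_get_port() */
		a.sin_port = 0;
		bind(b, (struct sockaddr *)&a, sizeof(a));
		getsockname(b, (struct sockaddr *)&a, &len);
		printf("bind(0) port:   %d\n", ntohs(a.sin_port));

		close(b);
		close(c);
		close(lsn);
		return 0;
	}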