Merge branch 'rhashtable-next'

Ying Xue says:

====================
remove nl_sk_hash_lock from netlink socket

After tipc socket successfully avoids the involvement of an extra lock
with rhashtable_lookup_insert(), it's possible for netlink socket to
remove its hash socket lock now. But as netlink socket needs a compare
function to look for an object, we first introduce a new function
called rhashtable_lookup_compare_insert() in commit #1 which is
implemented based on original rhashtable_lookup_insert(). We
subsequently remove nl_sk_hash_lock from netlink socket with the new
introduced function in commit #2. Lastly, as Thomas requested, we add
commit #3 to indicate the implementation of what the grow and shrink
decision function must enforce min/max shift.

v2:
 As Thomas pointed out, there was a race between checking portid and
 then setting it in commit #2. Now use socket lock to make the process
 of both checking and setting portid atomic, and then eliminate the
 race.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 326acd8..9570832 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -79,6 +79,10 @@
  * @obj_hashfn: Function to hash object
  * @grow_decision: If defined, may return true if table should expand
  * @shrink_decision: If defined, may return true if table should shrink
+ *
+ * Note: when implementing the grow and shrink decision function, min/max
+ * shift must be enforced, otherwise, resizing watermarks they set may be
+ * useless.
  */
 struct rhashtable_params {
 	size_t			nelem_hint;
@@ -168,7 +172,12 @@
 void *rhashtable_lookup(struct rhashtable *ht, const void *key);
 void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
 				bool (*compare)(void *, void *), void *arg);
+
 bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj);
+bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
+				      struct rhash_head *obj,
+				      bool (*compare)(void *, void *),
+				      void *arg);
 
 void rhashtable_destroy(struct rhashtable *ht);
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 8023b55..ed6ae1a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -727,6 +727,43 @@
  */
 bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj)
 {
+	struct rhashtable_compare_arg arg = {
+		.ht = ht,
+		.key = rht_obj(ht, obj) + ht->p.key_offset,
+	};
+
+	BUG_ON(!ht->p.key_len);
+
+	return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare,
+						&arg);
+}
+EXPORT_SYMBOL_GPL(rhashtable_lookup_insert);
+
+/**
+ * rhashtable_lookup_compare_insert - search and insert object to hash table
+ *                                    with compare function
+ * @ht:		hash table
+ * @obj:	pointer to hash head inside object
+ * @compare:	compare function, must return true on match
+ * @arg:	argument passed on to compare function
+ *
+ * Locks down the bucket chain in both the old and new table if a resize
+ * is in progress to ensure that writers can't remove from the old table
+ * and can't insert to the new table during the atomic operation of search
+ * and insertion. Searches for duplicates in both the old and new table if
+ * a resize is in progress.
+ *
+ * Lookups may occur in parallel with hashtable mutations and resizing.
+ *
+ * Will trigger an automatic deferred table resizing if the size grows
+ * beyond the watermark indicated by grow_decision() which can be passed
+ * to rhashtable_init().
+ */
+bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
+				      struct rhash_head *obj,
+				      bool (*compare)(void *, void *),
+				      void *arg)
+{
 	struct bucket_table *new_tbl, *old_tbl;
 	spinlock_t *new_bucket_lock, *old_bucket_lock;
 	u32 new_hash, old_hash;
@@ -747,7 +784,8 @@
 	if (unlikely(old_tbl != new_tbl))
 		spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
 
-	if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) {
+	if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
+				      compare, arg)) {
 		success = false;
 		goto exit;
 	}
@@ -763,7 +801,7 @@
 
 	return success;
 }
-EXPORT_SYMBOL_GPL(rhashtable_lookup_insert);
+EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert);
 
 static size_t rounded_hashtable_size(struct rhashtable_params *params)
 {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 298e1df..01b702d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -98,7 +98,7 @@
 
 /* nl_table locking explained:
  * Lookup and traversal are protected with an RCU read-side lock. Insertion
- * and removal are protected with nl_sk_hash_lock while using RCU list
+ * and removal are protected with per bucket lock while using RCU list
  * modification primitives and may run in parallel to RCU protected lookups.
  * Destruction of the Netlink socket may only occur *after* nl_table_lock has
  * been acquired * either during or after the socket has been removed from
@@ -110,10 +110,6 @@
 
 #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
 
-/* Protects netlink socket hash table mutations */
-DEFINE_MUTEX(nl_sk_hash_lock);
-EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
-
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static DEFINE_SPINLOCK(netlink_tap_lock);
@@ -998,6 +994,19 @@
 					 &netlink_compare, &arg);
 }
 
+static bool __netlink_insert(struct netlink_table *table, struct sock *sk,
+			     struct net *net)
+{
+	struct netlink_compare_arg arg = {
+		.net = net,
+		.portid = nlk_sk(sk)->portid,
+	};
+
+	return rhashtable_lookup_compare_insert(&table->hash,
+						&nlk_sk(sk)->node,
+						&netlink_compare, &arg);
+}
+
 static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
 {
 	struct netlink_table *table = &nl_table[protocol];
@@ -1043,9 +1052,7 @@
 	struct netlink_table *table = &nl_table[sk->sk_protocol];
 	int err = -EADDRINUSE;
 
-	mutex_lock(&nl_sk_hash_lock);
-	if (__netlink_lookup(table, portid, net))
-		goto err;
+	lock_sock(sk);
 
 	err = -EBUSY;
 	if (nlk_sk(sk)->portid)
@@ -1058,10 +1065,12 @@
 
 	nlk_sk(sk)->portid = portid;
 	sock_hold(sk);
-	rhashtable_insert(&table->hash, &nlk_sk(sk)->node);
-	err = 0;
+	if (__netlink_insert(table, sk, net))
+		err = 0;
+	else
+		sock_put(sk);
 err:
-	mutex_unlock(&nl_sk_hash_lock);
+	release_sock(sk);
 	return err;
 }
 
@@ -1069,13 +1078,11 @@
 {
 	struct netlink_table *table;
 
-	mutex_lock(&nl_sk_hash_lock);
 	table = &nl_table[sk->sk_protocol];
 	if (rhashtable_remove(&table->hash, &nlk_sk(sk)->node)) {
 		WARN_ON(atomic_read(&sk->sk_refcnt) == 1);
 		__sock_put(sk);
 	}
-	mutex_unlock(&nl_sk_hash_lock);
 
 	netlink_table_grab();
 	if (nlk_sk(sk)->subscriptions) {
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index fd96fa7..7518375 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -74,6 +74,5 @@
 
 extern struct netlink_table *nl_table;
 extern rwlock_t nl_table_lock;
-extern struct mutex nl_sk_hash_lock;
 
 #endif
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index fcca36d..bb59a7e 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -103,7 +103,7 @@
 {
 	struct netlink_table *tbl = &nl_table[protocol];
 	struct rhashtable *ht = &tbl->hash;
-	const struct bucket_table *htbl = rht_dereference(ht->tbl, ht);
+	const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
 	struct net *net = sock_net(skb->sk);
 	struct netlink_diag_req *req;
 	struct netlink_sock *nlsk;
@@ -115,7 +115,7 @@
 	for (i = 0; i < htbl->size; i++) {
 		struct rhash_head *pos;
 
-		rht_for_each_entry(nlsk, pos, htbl, i, node) {
+		rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) {
 			sk = (struct sock *)nlsk;
 
 			if (!net_eq(sock_net(sk), net))
@@ -172,7 +172,7 @@
 
 	req = nlmsg_data(cb->nlh);
 
-	mutex_lock(&nl_sk_hash_lock);
+	rcu_read_lock();
 	read_lock(&nl_table_lock);
 
 	if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
@@ -186,7 +186,7 @@
 	} else {
 		if (req->sdiag_protocol >= MAX_LINKS) {
 			read_unlock(&nl_table_lock);
-			mutex_unlock(&nl_sk_hash_lock);
+			rcu_read_unlock();
 			return -ENOENT;
 		}
 
@@ -194,7 +194,7 @@
 	}
 
 	read_unlock(&nl_table_lock);
-	mutex_unlock(&nl_sk_hash_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 }