mlx4_core: Keep free count for MTT buddy allocator

MTT entries are allocated with a buddy allocator, which just keeps
bitmaps for each level of the buddy table.  However, all free space
starts out at the highest order, and small allocations start scanning
from the lowest order.  When the lowest order tables have no free
space, this can lead to scanning potentially millions of bits before
finding a free entry at a higher order.

We can avoid this by just keeping a count of how many free entries
each order has, and skipping the bitmap scan when an order is
completely empty.  This provides a nice performance boost for a
negligible increase in memory usage.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index a4023c2..7803849 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -118,6 +118,7 @@
 
 struct mlx4_buddy {
 	unsigned long	      **bits;
+	unsigned int	       *num_free;
 	int			max_order;
 	spinlock_t		lock;
 };
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index 03a9abc..b3ea93b 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -79,23 +79,26 @@
 
 	spin_lock(&buddy->lock);
 
-	for (o = order; o <= buddy->max_order; ++o) {
-		m = 1 << (buddy->max_order - o);
-		seg = find_first_bit(buddy->bits[o], m);
-		if (seg < m)
-			goto found;
-	}
+	for (o = order; o <= buddy->max_order; ++o)
+		if (buddy->num_free[o]) {
+			m = 1 << (buddy->max_order - o);
+			seg = find_first_bit(buddy->bits[o], m);
+			if (seg < m)
+				goto found;
+		}
 
 	spin_unlock(&buddy->lock);
 	return -1;
 
  found:
 	clear_bit(seg, buddy->bits[o]);
+	--buddy->num_free[o];
 
 	while (o > order) {
 		--o;
 		seg <<= 1;
 		set_bit(seg ^ 1, buddy->bits[o]);
+		++buddy->num_free[o];
 	}
 
 	spin_unlock(&buddy->lock);
@@ -113,11 +116,13 @@
 
 	while (test_bit(seg ^ 1, buddy->bits[order])) {
 		clear_bit(seg ^ 1, buddy->bits[order]);
+		--buddy->num_free[order];
 		seg >>= 1;
 		++order;
 	}
 
 	set_bit(seg, buddy->bits[order]);
+	++buddy->num_free[order];
 
 	spin_unlock(&buddy->lock);
 }
@@ -131,7 +136,9 @@
 
 	buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
 			      GFP_KERNEL);
-	if (!buddy->bits)
+	buddy->num_free = kzalloc((buddy->max_order + 1) * sizeof (int *),
+				  GFP_KERNEL);
+	if (!buddy->bits || !buddy->num_free)
 		goto err_out;
 
 	for (i = 0; i <= buddy->max_order; ++i) {
@@ -143,6 +150,7 @@
 	}
 
 	set_bit(0, buddy->bits[buddy->max_order]);
+	buddy->num_free[buddy->max_order] = 1;
 
 	return 0;
 
@@ -150,9 +158,10 @@
 	for (i = 0; i <= buddy->max_order; ++i)
 		kfree(buddy->bits[i]);
 
-	kfree(buddy->bits);
-
 err_out:
+	kfree(buddy->bits);
+	kfree(buddy->num_free);
+
 	return -ENOMEM;
 }
 
@@ -164,6 +173,7 @@
 		kfree(buddy->bits[i]);
 
 	kfree(buddy->bits);
+	kfree(buddy->num_free);
 }
 
 static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order)