writeback: split inode_wb_list_lock into bdi_writeback.list_lock

Split the global inode_wb_list_lock into a per-bdi_writeback list_lock,
as it's currently the most contended lock in the system for metadata
heavy workloads.  It won't help for single-filesystem workloads for
which we'll need the I/O-less balance_dirty_pages, but at least we
can dedicate a cpu to spinning on each bdi now for larger systems.

Based on earlier patches from Nick Piggin and Dave Chinner.

It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
                      inode_wb_list_lock:         42590          44433           0.12         147.74      144127.35         252274         886792           0.08         121.34      917211.23
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             34          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock          12893          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock          10702          [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             19          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock           5550          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock           8511          [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157

2.6.39-rc3 + patch:
                &(&wb->list_lock)->rlock:         11383          11657           0.14         151.69       40429.51          90825         527918           0.11         145.90      556843.37
                ------------------------
                &(&wb->list_lock)->rlock             10          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           1493          [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
                &(&wb->list_lock)->rlock           3652          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
                &(&wb->list_lock)->rlock           1412          [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
                ------------------------
                &(&wb->list_lock)->rlock              3          [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
                &(&wb->list_lock)->rlock              6          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           2061          [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
                &(&wb->list_lock)->rlock           2629          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f

hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a2421f..3c9a03e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -44,24 +44,28 @@
 {
 	return &BDEV_I(inode)->bdev;
 }
-
 EXPORT_SYMBOL(I_BDEV);
 
 /*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
  */
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+	if (unlikely(dst == old))		/* deadlock avoidance */
+		return;
+	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&old->wb.list_lock);
+	spin_unlock(&dst->wb.list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)