vfs: revert per-cpu nr_unused counters for dentry and inodes

The nr_unused counters count the number of objects on an LRU, and as such they
are synchronized with LRU object insertion and removal and scanning, and
protected under the LRU lock.

Making it per-cpu does not actually get any concurrency improvements because of
this lock, and summing the counter is much slower, and
incrementing/decrementing it costs more code size and is slower too.

These counters should stay per-LRU, which currently means global.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
diff --git a/fs/dcache.c b/fs/dcache.c
index 9d1a59df..f62ba90 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -68,14 +68,12 @@
 };
 
 static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
-	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -140,7 +138,7 @@
 	if (list_empty(&dentry->d_lru)) {
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		dentry_stat.nr_unused++;
 	}
 }
 
@@ -149,7 +147,7 @@
 	if (!list_empty(&dentry->d_lru)) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+		dentry_stat.nr_unused--;
 	}
 }
 
@@ -158,7 +156,7 @@
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		dentry_stat.nr_unused++;
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
@@ -546,7 +544,7 @@
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = dentry_stat.nr_unused;
 	int prune_ratio;
 	int pruned;
 
@@ -908,16 +906,13 @@
  */
 static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
-	int nr_unused;
-
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
-	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
+	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dcache_shrinker = {
@@ -2424,7 +2419,6 @@
 	int loop;
 
 	percpu_counter_init(&nr_dentry, 0);
-	percpu_counter_init(&nr_dentry_unused, 0);
 
 	/* 
 	 * A constructor could be added for stable state like the lists,