[XFS] On machines with more than 8 cpus, when running parallel I/O
threads, the incore superblock lock becomes the limiting factor for
buffered write throughput. Make the contended fields in the incore
superblock use per-cpu counters so that there is no global lock to limit
scalability.

SGI-PV: 946630
SGI-Modid: xfs-linux-melb:xfs-kern:25106a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 62188ea..9b43b7b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -51,11 +51,31 @@
 STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
+
+#ifdef HAVE_PERCPU_SB
+STATIC void	xfs_icsb_destroy_counters(xfs_mount_t *);
+STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int);
+STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
+STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
+						int, int);
+STATIC int	xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t,
+						int, int);
+
+#else
+
+#define xfs_icsb_destroy_counters(mp)			do { } while (0)
+#define xfs_icsb_balance_counter(mp, a, b)		do { } while (0)
+#define xfs_icsb_sync_counters(mp)			do { } while (0)
+#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)
+#define xfs_icsb_modify_counters_locked(mp, a, b, c)	do { } while (0)
+
+#endif
+
 static const struct {
-    short offset;
-    short type;     /* 0 = integer
-		* 1 = binary / string (no translation)
-		*/
+	short offset;
+	short type;	/* 0 = integer
+			 * 1 = binary / string (no translation)
+			 */
 } xfs_sb_info[] = {
     { offsetof(xfs_sb_t, sb_magicnum),   0 },
     { offsetof(xfs_sb_t, sb_blocksize),  0 },
@@ -113,7 +133,11 @@
 {
 	xfs_mount_t *mp;
 
-	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
+	mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
+
+	if (xfs_icsb_init_counters(mp)) {
+		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+	}
 
 	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
 	spinlock_init(&mp->m_sb_lock, "xfs_sb");
@@ -136,8 +160,8 @@
  */
 void
 xfs_mount_free(
-	xfs_mount_t *mp,
-	int	    remove_bhv)
+	xfs_mount_t	*mp,
+	int		remove_bhv)
 {
 	if (mp->m_ihash)
 		xfs_ihash_free(mp);
@@ -177,6 +201,7 @@
 		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
 	}
 
+	xfs_icsb_destroy_counters(mp);
 	kmem_free(mp, sizeof(xfs_mount_t));
 }
 
@@ -527,6 +552,10 @@
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 	}
 
+	xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
+	xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
+	xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+
 	mp->m_sb_bp = bp;
 	xfs_buf_relse(bp);
 	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
@@ -1154,6 +1183,9 @@
 	sbp = xfs_getsb(mp, 0);
 	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
 		XFS_FORCED_SHUTDOWN(mp))) {
+
+		xfs_icsb_sync_counters(mp);
+
 		/*
 		 * mark shared-readonly if desired
 		 */
@@ -1227,7 +1259,6 @@
 
 	xfs_trans_log_buf(tp, bp, first, last);
 }
-
 /*
  * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
  * a delta to a specified field in the in-core superblock.  Simply
@@ -1237,7 +1268,7 @@
  *
  * The SB_LOCK must be held when this routine is called.
  */
-STATIC int
+int
 xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 			int delta, int rsvd)
 {
@@ -1406,9 +1437,26 @@
 	unsigned long	s;
 	int	status;
 
-	s = XFS_SB_LOCK(mp);
-	status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-	XFS_SB_UNLOCK(mp, s);
+	/* check for per-cpu counters */
+	switch (field) {
+#ifdef HAVE_PERCPU_SB
+	case XFS_SBS_ICOUNT:
+	case XFS_SBS_IFREE:
+	case XFS_SBS_FDBLOCKS:
+		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+			status = xfs_icsb_modify_counters(mp, field,
+							delta, rsvd);
+			break;
+		}
+		/* FALLTHROUGH */
+#endif
+	default:
+		s = XFS_SB_LOCK(mp);
+		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+		XFS_SB_UNLOCK(mp, s);
+		break;
+	}
+
 	return status;
 }
 
@@ -1445,8 +1493,26 @@
 		 * from the loop so we'll fall into the undo loop
 		 * below.
 		 */
-		status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
-						    msbp->msb_delta, rsvd);
+		switch (msbp->msb_field) {
+#ifdef HAVE_PERCPU_SB
+		case XFS_SBS_ICOUNT:
+		case XFS_SBS_IFREE:
+		case XFS_SBS_FDBLOCKS:
+			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+				status = xfs_icsb_modify_counters_locked(mp,
+							msbp->msb_field,
+							msbp->msb_delta, rsvd);
+				break;
+			}
+			/* FALLTHROUGH */
+#endif
+		default:
+			status = xfs_mod_incore_sb_unlocked(mp,
+						msbp->msb_field,
+						msbp->msb_delta, rsvd);
+			break;
+		}
+
 		if (status != 0) {
 			break;
 		}
@@ -1463,8 +1529,28 @@
 	if (status != 0) {
 		msbp--;
 		while (msbp >= msb) {
-			status = xfs_mod_incore_sb_unlocked(mp,
-				    msbp->msb_field, -(msbp->msb_delta), rsvd);
+			switch (msbp->msb_field) {
+#ifdef HAVE_PERCPU_SB
+			case XFS_SBS_ICOUNT:
+			case XFS_SBS_IFREE:
+			case XFS_SBS_FDBLOCKS:
+				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+					status =
+					    xfs_icsb_modify_counters_locked(mp,
+							msbp->msb_field,
+							-(msbp->msb_delta),
+							rsvd);
+					break;
+				}
+				/* FALLTHROUGH */
+#endif
+			default:
+				status = xfs_mod_incore_sb_unlocked(mp,
+							msbp->msb_field,
+							-(msbp->msb_delta),
+							rsvd);
+				break;
+			}
 			ASSERT(status == 0);
 			msbp--;
 		}
@@ -1577,3 +1663,445 @@
 	xfs_mod_sb(tp, fields);
 	xfs_trans_commit(tp, 0, NULL);
 }
+
+
+#ifdef HAVE_PERCPU_SB
+/*
+ * Per-cpu incore superblock counters
+ *
+ * Simple concept, difficult implementation
+ *
+ * Basically, replace the incore superblock counters with a distributed per cpu
+ * counter for contended fields (e.g.  free block count).
+ *
+ * Difficulties arise in that the incore sb is used for ENOSPC checking, and
+ * hence needs to be accurately read when we are running low on space. Hence
+ * there is a method to enable and disable the per-cpu counters based on how
+ * much "stuff" is available in them.
+ *
+ * Basically, a counter is enabled if there is enough free resource to justify
+ * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
+ * ENOSPC), then we disable the counters to synchronise all callers and
+ * re-distribute the available resources.
+ *
+ * If, once we redistributed the available resources, we still get a failure,
+ * we disable the per-cpu counter and go through the slow path.
+ *
+ * The slow path is the current xfs_mod_incore_sb() function.  This means that
+ * when we disable a per-cpu counter, we need to drain it's resources back to
+ * the global superblock. We do this after disabling the counter to prevent
+ * more threads from queueing up on the counter.
+ *
+ * Essentially, this means that we still need a lock in the fast path to enable
+ * synchronisation between the global counters and the per-cpu counters. This
+ * is not a problem because the lock will be local to a CPU almost all the time
+ * and have little contention except when we get to ENOSPC conditions.
+ *
+ * Basically, this lock becomes a barrier that enables us to lock out the fast
+ * path while we do things like enabling and disabling counters and
+ * synchronising the counters.
+ *
+ * Locking rules:
+ *
+ * 	1. XFS_SB_LOCK() before picking up per-cpu locks
+ * 	2. per-cpu locks always picked up via for_each_online_cpu() order
+ * 	3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
+ * 	4. modifying per-cpu counters requires holding per-cpu lock
+ * 	5. modifying global counters requires holding XFS_SB_LOCK
+ *	6. enabling or disabling a counter requires holding the XFS_SB_LOCK
+ *	   and _none_ of the per-cpu locks.
+ *
+ * Disabled counters are only ever re-enabled by a balance operation
+ * that results in more free resources per CPU than a given threshold.
+ * To ensure counters don't remain disabled, they are rebalanced when
+ * the global resource goes above a higher threshold (i.e. some hysteresis
+ * is present to prevent thrashing).
+ *
+ * Note: hotplug CPUs not yet supported
+ */
+int
+xfs_icsb_init_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
+	if (mp->m_sb_cnts == NULL)
+		return -ENOMEM;
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		spin_lock_init(&cntp->icsb_lock);
+	}
+	/*
+	 * start with all counters disabled so that the
+	 * initial balance kicks us off correctly
+	 */
+	mp->m_icsb_counters = -1;
+	return 0;
+}
+
+STATIC void
+xfs_icsb_destroy_counters(
+	xfs_mount_t	*mp)
+{
+	if (mp->m_sb_cnts)
+		free_percpu(mp->m_sb_cnts);
+}
+
+
+STATIC inline void
+xfs_icsb_lock_all_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		spin_lock(&cntp->icsb_lock);
+	}
+}
+
+STATIC inline void
+xfs_icsb_unlock_all_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		spin_unlock(&cntp->icsb_lock);
+	}
+}
+
+STATIC void
+xfs_icsb_count(
+	xfs_mount_t	*mp,
+	xfs_icsb_cnts_t	*cnt,
+	int		flags)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
+
+	if (!(flags & XFS_ICSB_LAZY_COUNT))
+		xfs_icsb_lock_all_counters(mp);
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		cnt->icsb_icount += cntp->icsb_icount;
+		cnt->icsb_ifree += cntp->icsb_ifree;
+		cnt->icsb_fdblocks += cntp->icsb_fdblocks;
+	}
+
+	if (!(flags & XFS_ICSB_LAZY_COUNT))
+		xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC int
+xfs_icsb_counter_disabled(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field)
+{
+	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+	return test_bit(field, &mp->m_icsb_counters);
+}
+
+STATIC int
+xfs_icsb_disable_counter(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field)
+{
+	xfs_icsb_cnts_t	cnt;
+
+	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+	xfs_icsb_lock_all_counters(mp);
+	if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
+		/* drain back to superblock */
+
+		xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
+		switch(field) {
+		case XFS_SBS_ICOUNT:
+			mp->m_sb.sb_icount = cnt.icsb_icount;
+			break;
+		case XFS_SBS_IFREE:
+			mp->m_sb.sb_ifree = cnt.icsb_ifree;
+			break;
+		case XFS_SBS_FDBLOCKS:
+			mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	xfs_icsb_unlock_all_counters(mp);
+
+	return 0;
+}
+
+STATIC void
+xfs_icsb_enable_counter(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	uint64_t	count,
+	uint64_t	resid)
+{
+	xfs_icsb_cnts_t	*cntp;
+	int		i;
+
+	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+	xfs_icsb_lock_all_counters(mp);
+	for_each_online_cpu(i) {
+		cntp = per_cpu_ptr(mp->m_sb_cnts, i);
+		switch (field) {
+		case XFS_SBS_ICOUNT:
+			cntp->icsb_icount = count + resid;
+			break;
+		case XFS_SBS_IFREE:
+			cntp->icsb_ifree = count + resid;
+			break;
+		case XFS_SBS_FDBLOCKS:
+			cntp->icsb_fdblocks = count + resid;
+			break;
+		default:
+			BUG();
+			break;
+		}
+		resid = 0;
+	}
+	clear_bit(field, &mp->m_icsb_counters);
+	xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC void
+xfs_icsb_sync_counters_int(
+	xfs_mount_t	*mp,
+	int		flags)
+{
+	xfs_icsb_cnts_t	cnt;
+	int		s;
+
+	/* Pass 1: lock all counters */
+	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
+		s = XFS_SB_LOCK(mp);
+
+	xfs_icsb_count(mp, &cnt, flags);
+
+	/* Step 3: update mp->m_sb fields */
+	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
+		mp->m_sb.sb_icount = cnt.icsb_icount;
+	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
+		mp->m_sb.sb_ifree = cnt.icsb_ifree;
+	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
+		mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+
+	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
+		XFS_SB_UNLOCK(mp, s);
+}
+
+/*
+ * Accurate update of per-cpu counters to incore superblock
+ */
+STATIC void
+xfs_icsb_sync_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_sync_counters_int(mp, 0);
+}
+
+/*
+ * lazy addition used for things like df, background sb syncs, etc
+ */
+void
+xfs_icsb_sync_counters_lazy(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT);
+}
+
+/*
+ * Balance and enable/disable counters as necessary.
+ *
+ * Thresholds for re-enabling counters are somewhat magic.
+ * inode counts are chosen to be the same number as single
+ * on disk allocation chunk per CPU, and free blocks is
+ * something far enough zero that we aren't going thrash
+ * when we get near ENOSPC.
+ */
+#define XFS_ICSB_INO_CNTR_REENABLE	64
+#define XFS_ICSB_FDBLK_CNTR_REENABLE	512
+STATIC void
+xfs_icsb_balance_counter(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t  field,
+	int		flags)
+{
+	uint64_t	count, resid = 0;
+	int		weight = num_online_cpus();
+	int		s;
+
+	if (!(flags & XFS_ICSB_SB_LOCKED))
+		s = XFS_SB_LOCK(mp);
+
+	/* disable counter and sync counter */
+	xfs_icsb_disable_counter(mp, field);
+
+	/* update counters  - first CPU gets residual*/
+	switch (field) {
+	case XFS_SBS_ICOUNT:
+		count = mp->m_sb.sb_icount;
+		resid = do_div(count, weight);
+		if (count < XFS_ICSB_INO_CNTR_REENABLE)
+			goto out;
+		break;
+	case XFS_SBS_IFREE:
+		count = mp->m_sb.sb_ifree;
+		resid = do_div(count, weight);
+		if (count < XFS_ICSB_INO_CNTR_REENABLE)
+			goto out;
+		break;
+	case XFS_SBS_FDBLOCKS:
+		count = mp->m_sb.sb_fdblocks;
+		resid = do_div(count, weight);
+		if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
+			goto out;
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	xfs_icsb_enable_counter(mp, field, count, resid);
+out:
+	if (!(flags & XFS_ICSB_SB_LOCKED))
+		XFS_SB_UNLOCK(mp, s);
+}
+
+STATIC int
+xfs_icsb_modify_counters_int(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		delta,
+	int		rsvd,
+	int		flags)
+{
+	xfs_icsb_cnts_t	*icsbp;
+	long long	lcounter;	/* long counter for 64 bit fields */
+	int		cpu, s, locked = 0;
+	int		ret = 0, balance_done = 0;
+
+again:
+	cpu = get_cpu();
+	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu),
+	spin_lock(&icsbp->icsb_lock);
+	if (unlikely(xfs_icsb_counter_disabled(mp, field)))
+		goto slow_path;
+
+	switch (field) {
+	case XFS_SBS_ICOUNT:
+		lcounter = icsbp->icsb_icount;
+		lcounter += delta;
+		if (unlikely(lcounter < 0))
+			goto slow_path;
+		icsbp->icsb_icount = lcounter;
+		break;
+
+	case XFS_SBS_IFREE:
+		lcounter = icsbp->icsb_ifree;
+		lcounter += delta;
+		if (unlikely(lcounter < 0))
+			goto slow_path;
+		icsbp->icsb_ifree = lcounter;
+		break;
+
+	case XFS_SBS_FDBLOCKS:
+		BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
+
+		lcounter = icsbp->icsb_fdblocks;
+		lcounter += delta;
+		if (unlikely(lcounter < 0))
+			goto slow_path;
+		icsbp->icsb_fdblocks = lcounter;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	spin_unlock(&icsbp->icsb_lock);
+	put_cpu();
+	if (locked)
+		XFS_SB_UNLOCK(mp, s);
+	return 0;
+
+	/*
+	 * The slow path needs to be run with the SBLOCK
+	 * held so that we prevent other threads from
+	 * attempting to run this path at the same time.
+	 * this provides exclusion for the balancing code,
+	 * and exclusive fallback if the balance does not
+	 * provide enough resources to continue in an unlocked
+	 * manner.
+	 */
+slow_path:
+	spin_unlock(&icsbp->icsb_lock);
+	put_cpu();
+
+	/* need to hold superblock incase we need
+	 * to disable a counter */
+	if (!(flags & XFS_ICSB_SB_LOCKED)) {
+		s = XFS_SB_LOCK(mp);
+		locked = 1;
+		flags |= XFS_ICSB_SB_LOCKED;
+	}
+	if (!balance_done) {
+		xfs_icsb_balance_counter(mp, field, flags);
+		balance_done = 1;
+		goto again;
+	} else {
+		/*
+		 * we might not have enough on this local
+		 * cpu to allocate for a bulk request.
+		 * We need to drain this field from all CPUs
+		 * and disable the counter fastpath
+		 */
+		xfs_icsb_disable_counter(mp, field);
+	}
+
+	ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+
+	if (locked)
+		XFS_SB_UNLOCK(mp, s);
+	return ret;
+}
+
+STATIC int
+xfs_icsb_modify_counters(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		delta,
+	int		rsvd)
+{
+	return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0);
+}
+
+/*
+ * Called when superblock is already locked
+ */
+STATIC int
+xfs_icsb_modify_counters_locked(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		delta,
+	int		rsvd)
+{
+	return xfs_icsb_modify_counters_int(mp, field, delta,
+						rsvd, XFS_ICSB_SB_LOCKED);
+}
+#endif