Merge branch 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6

* 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6:
  [XFS] Avoid replaying inode buffer initialisation log items if on-disk version is newer.
  [XFS] Ensure file size updates have been completed before writing inode to disk.
  [XFS] On-demand reaping of the MRU cache
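
The MRU cache reaper now runs only while the cache holds entries, so
xfs_mru_cache_flush() loses its restart parameter.  A rough sketch of the
caller-visible change (call site as in fs/xfs/xfs_filestream.c):

	/* before: point-in-time flush, explicitly keeping the reaper alive */
	xfs_mru_cache_flush(mp->m_filestream, 1);

	/* after: the next insert rearms the reaper on demand */
	xfs_mru_cache_flush(mp->m_filestream);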
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d9c40fe..5f152f6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -181,6 +181,7 @@
 		ip->i_d.di_size = isize;
 		ip->i_update_core = 1;
 		ip->i_update_size = 1;
+		mark_inode_dirty_sync(vn_to_inode(ioend->io_vnode));
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4528f9a..491d1f4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -415,8 +415,10 @@
 
 	if (vp) {
 		vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
-		if (sync)
+		if (sync) {
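+			/*
+			 * Wait for in-flight data I/O first, so that inode
+			 * size updates performed at I/O completion are in
+			 * place before the inode itself is flushed.
+			 */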
+			filemap_fdatawait(inode->i_mapping);
 			flags |= FLUSH_SYNC;
+		}
 		error = bhv_vop_iflush(vp, flags);
 		if (error == EAGAIN)
 			error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0;
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index d7e1361..fa25b7d 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -52,6 +52,11 @@
 #define	XFS_BLI_UDQUOT_BUF	0x4
 #define XFS_BLI_PDQUOT_BUF	0x8
 #define	XFS_BLI_GDQUOT_BUF	0x10
+/*
+ * This flag indicates that the buffer contains newly allocated
+ * inodes.
+ */
+#define	XFS_BLI_INODE_NEW_BUF	0x20
 
 #define	XFS_BLI_CHUNK		128
 #define	XFS_BLI_SHIFT		7
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index ce22786..16f8e17 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -467,8 +467,7 @@
 xfs_filestream_flush(
 	xfs_mount_t	*mp)
 {
-	/* point in time flush, so keep the reaper running */
-	xfs_mru_cache_flush(mp->m_filestream, 1);
+	xfs_mru_cache_flush(mp->m_filestream);
 }
 
 /*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ae6e8e..dacb197 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1874,6 +1874,7 @@
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
+	xfs_mount_t		*mp,
 	xlog_recover_item_t	*item,
 	xfs_buf_t		*bp,
 	xfs_buf_log_format_t	*buf_f)
@@ -1884,6 +1885,50 @@
 	unsigned int		*data_map = NULL;
 	unsigned int		map_size = 0;
 	int                     error;
+	int			stale_buf = 1;
+
+	/*
+	 * Scan through the on-disk inode buffer and attempt to
+	 * determine if it has been written to since it was logged.
+	 *
+	 * - If any of the magic numbers are incorrect then the buffer is stale
+	 * - If any of the modes are non-zero then the buffer is not stale
+	 * - If all of the modes are zero and at least one of the generation
+	 *   counts is non-zero then the buffer is stale
+	 *
+	 * If the end result is a stale buffer then the log buffer is
+	 * replayed, otherwise it is skipped.
+	 *
+	 * This heuristic is not perfect.  It can be improved by scanning the
+	 * entire inode chunk for evidence that any of the inode clusters have
+	 * been updated.  To fix this problem completely we will need a major
+	 * architectural change to the logging system.
+	 */
+	if (buf_f->blf_flags & XFS_BLI_INODE_NEW_BUF) {
+		xfs_dinode_t    *dip;
+		int             inodes_per_buf;
+		int		mode_count = 0;
+		int		gen_count = 0;
+
+		stale_buf = 0;
+		inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+		for (i = 0; i < inodes_per_buf; i++) {
+			dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+				i * mp->m_sb.sb_inodesize);
+			if (be16_to_cpu(dip->di_core.di_magic) !=
+					XFS_DINODE_MAGIC) {
+				stale_buf = 1;
+				break;
+			}
+			if (be16_to_cpu(dip->di_core.di_mode))
+				mode_count++;
+			if (be32_to_cpu(dip->di_core.di_gen))
+				gen_count++;
+		}
+
+		if (!mode_count && gen_count)
+			stale_buf = 1;
+	}
 
 	switch (buf_f->blf_type) {
 	case XFS_LI_BUF:
@@ -1917,7 +1962,7 @@
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 		}
-		if (!error)
+		if (!error && stale_buf)
 			memcpy(xfs_buf_offset(bp,
 				(uint)bit << XFS_BLI_SHIFT),	/* dest */
 				item->ri_buf[i].i_addr,		/* source */
@@ -2089,7 +2134,7 @@
 	if (log->l_quotaoffs_flag & type)
 		return;
 
-	xlog_recover_do_reg_buffer(item, bp, buf_f);
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 }
 
 /*
@@ -2190,7 +2235,7 @@
 		  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
 	} else {
-		xlog_recover_do_reg_buffer(item, bp, buf_f);
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 	}
 	if (error)
 		return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 7deb9e3..e0b358c 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -206,8 +206,11 @@
 	 */
 	if (!_xfs_mru_cache_migrate(mru, now)) {
 		mru->time_zero = now;
-		if (!mru->next_reap)
-			mru->next_reap = mru->grp_count * mru->grp_time;
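+		/*
+		 * Queue the reaper on demand: it only runs while the
+		 * cache holds elements to reap.
+		 */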
+		if (!mru->queued) {
+			mru->queued = 1;
+			queue_delayed_work(xfs_mru_reap_wq, &mru->work,
+			                   mru->grp_count * mru->grp_time);
+		}
 	} else {
 		grp = (now - mru->time_zero) / mru->grp_time;
 		grp = (mru->lru_grp + grp) % mru->grp_count;
@@ -271,29 +274,26 @@
 	struct work_struct	*work)
 {
 	xfs_mru_cache_t		*mru = container_of(work, xfs_mru_cache_t, work.work);
-	unsigned long		now;
+	unsigned long		now, next;
 
 	ASSERT(mru && mru->lists);
 	if (!mru || !mru->lists)
 		return;
 
 	mutex_spinlock(&mru->lock);
-	now = jiffies;
-	if (mru->reap_all ||
-	    (mru->next_reap && time_after(now, mru->next_reap))) {
-		if (mru->reap_all)
-			now += mru->grp_count * mru->grp_time * 2;
-		mru->next_reap = _xfs_mru_cache_migrate(mru, now);
-		_xfs_mru_cache_clear_reap_list(mru);
+	next = _xfs_mru_cache_migrate(mru, jiffies);
+	_xfs_mru_cache_clear_reap_list(mru);
+
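+	/*
+	 * _xfs_mru_cache_migrate() returned the absolute time the next
+	 * reap is due, or zero if nothing is left to reap.  Requeue
+	 * ourselves with the remaining delay.
+	 */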
+	mru->queued = next;
+	if (mru->queued) {
+		now = jiffies;
+		if (next <= now)
+			next = 0;
+		else
+			next -= now;
+		queue_delayed_work(xfs_mru_reap_wq, &mru->work, next);
 	}
 
-	/*
-	 * the process that triggered the reap_all is responsible
-	 * for restating the periodic reap if it is required.
-	 */
-	if (!mru->reap_all)
-		queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
-	mru->reap_all = 0;
 	mutex_spinunlock(&mru->lock, 0);
 }
 
@@ -352,7 +352,7 @@
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
-	mru->lists = kmem_alloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
+	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
 
 	if (!mru->lists) {
 		err = ENOMEM;
@@ -374,11 +374,6 @@
 	mru->grp_time  = grp_time;
 	mru->free_func = free_func;
 
-	/* start up the reaper event */
-	mru->next_reap = 0;
-	mru->reap_all = 0;
-	queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
-
 	*mrup = mru;
 
 exit:
@@ -394,35 +389,25 @@
  * Call xfs_mru_cache_flush() to flush out all cached entries, calling their
  * free functions as they're deleted.  When this function returns, the caller is
  * guaranteed that all the free functions for all the elements have finished
- * executing.
- *
- * While we are flushing, we stop the periodic reaper event from triggering.
- * Normally, we want to restart this periodic event, but if we are shutting
- * down the cache we do not want it restarted. hence the restart parameter
- * where 0 = do not restart reaper and 1 = restart reaper.
+ * executing and the reaper is not running.
  */
 void
 xfs_mru_cache_flush(
-	xfs_mru_cache_t		*mru,
-	int			restart)
+	xfs_mru_cache_t		*mru)
 {
 	if (!mru || !mru->lists)
 		return;
 
-	cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
-
 	mutex_spinlock(&mru->lock);
-	mru->reap_all = 1;
-	mutex_spinunlock(&mru->lock, 0);
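+	/*
+	 * The reaper takes mru->lock itself, so drop the lock before
+	 * waiting for the delayed work to cancel or we would deadlock.
+	 */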
+	if (mru->queued) {
+		mutex_spinunlock(&mru->lock, 0);
+		cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
+		mutex_spinlock(&mru->lock);
+	}
 
-	queue_work(xfs_mru_reap_wq, &mru->work.work);
-	flush_workqueue(xfs_mru_reap_wq);
+	_xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time);
+	_xfs_mru_cache_clear_reap_list(mru);
 
-	mutex_spinlock(&mru->lock);
-	WARN_ON_ONCE(mru->reap_all != 0);
-	mru->reap_all = 0;
-	if (restart)
-		queue_delayed_work(xfs_mru_reap_wq, &mru->work, mru->grp_time);
 	mutex_spinunlock(&mru->lock, 0);
 }
 
@@ -433,8 +418,7 @@
 	if (!mru || !mru->lists)
 		return;
 
-	/* we don't want the reaper to restart here */
-	xfs_mru_cache_flush(mru, 0);
+	xfs_mru_cache_flush(mru);
 
 	kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
 	kmem_free(mru, sizeof(*mru));
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 624fd10..dd58ea1 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -32,11 +32,9 @@
 	unsigned int		grp_time;  /* Time period spanned by grps.  */
 	unsigned int		lru_grp;   /* Group containing time zero.   */
 	unsigned long		time_zero; /* Time first element was added. */
-	unsigned long		next_reap; /* Time that the reaper should
-					      next do something. */
-	unsigned int		reap_all;  /* if set, reap all lists */
 	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
 	struct delayed_work	work;      /* Workqueue data for reaping.   */
+	unsigned int		queued;	   /* work has been queued */
 } xfs_mru_cache_t;
 
 int xfs_mru_cache_init(void);
@@ -44,7 +42,7 @@
 int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
 			     unsigned int grp_count,
 			     xfs_mru_cache_free_func_t free_func);
-void xfs_mru_cache_flush(xfs_mru_cache_t *mru, int restart);
+void xfs_mru_cache_flush(xfs_mru_cache_t *mru);
 void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
 int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 				void *value);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b89..95fff68 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -966,6 +966,7 @@
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 
 	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
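+	/*
+	 * Also tag the on-disk log format so that recovery can tell this
+	 * buffer contains newly allocated inodes.
+	 */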
+	bip->bli_format.blf_flags |= XFS_BLI_INODE_NEW_BUF;
 }
 
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1a5ad8c..6034592 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1082,6 +1082,9 @@
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
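+	/*
+	 * Wait for pending data I/O first, so that file size updates
+	 * performed at I/O completion have reached the inode before its
+	 * state is examined below.
+	 */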
+	if (flag & FSYNC_DATA)
+		filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+
 	/*
 	 * We always need to make sure that the required inode state
 	 * is safe on disk.  The vnode might be clean but because
@@ -3769,12 +3772,16 @@
 			sync_lsn = log->l_last_sync_lsn;
 			GRANT_UNLOCK(log, s);
 
-			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
-				return 0;
+			if (XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0) {
+				if (flags & FLUSH_SYNC)
+					log_flags |= XFS_LOG_SYNC;
+				error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
+				if (error)
+					return error;
+			}
 
-			if (flags & FLUSH_SYNC)
-				log_flags |= XFS_LOG_SYNC;
-			return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
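+			/*
+			 * If the inode core is clean after the log force,
+			 * there is nothing further to flush.
+			 */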
+			if (ip->i_update_core == 0)
+				return 0;
 		}
 	}
 
@@ -3788,9 +3795,6 @@
 	if (flags & FLUSH_INODE) {
 		int	flush_flags;
 
-		if (xfs_ipincount(ip))
-			return EAGAIN;
-
 		if (flags & FLUSH_SYNC) {
 			xfs_ilock(ip, XFS_ILOCK_SHARED);
 			xfs_iflock(ip);