xfs: di_flushiter considered harmful

When we made all inode updates transactional, we no longer needed
the log recovery detection for inodes being newer on disk than the
transaction being replayed - it was redundant as replay of the log
would always result in the latest version of the inode would be on
disk. It was redundant, but left in place because it wasn't
considered to be a problem.

However, with the new "don't read inodes on create" optimisation,
flushiter has come back to bite us. Essentially, the optimisation
made always initialises flushiter to zero in the create transaction,
and so if we then crash and run recovery and the inode already on
disk has a non-zero flushiter it will skip recovery of that inode.
As a result, log recovery does the wrong thing and we end up with a
corrupt filesystem.

Because we have to support old kernel to new kernel upgrades, we
can't just get rid of the flushiter support in log recovery as we
might be upgrading from a kernel that doesn't have fully transactional
inode updates.  Unfortunately, for v4 superblocks there is no way to
guarantee that log recovery knows about this fact.

We cannot add a new inode format flag to say it's a "special inode
create" because it won't be understood by older kernels and so
recovery could do the wrong thing on downgrade. We cannot specially
detect the combination of zero mode/non-zero flushiter on disk to
non-zero mode, zero flushiter in the log item during recovery
because wrapping of the flushiter can result in false detection.

Hence that makes this "don't use flushiter" optimisation limited to
a disk format that guarantees that we don't need it. And that means
the only fix here is to limit the "no read IO on create"
optimisation to version 5 superblocks....

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>

diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 07d735a..e5869b5 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@
  * There is a very similar struct icdinode in xfs_inode which matches the
  * layout of the first 96 bytes of this structure, but is kept in native
  * format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
  */
 typedef struct xfs_dinode {
 	__be16		di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b78481f..bb262c2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@
 	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
-	to->di_flushiter = cpu_to_be16(from->di_flushiter);
 	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@
 		to->di_lsn = cpu_to_be64(from->di_lsn);
 		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 		uuid_copy(&to->di_uuid, &from->di_uuid);
+		to->di_flushiter = 0;
+	} else {
+		to->di_flushiter = cpu_to_be16(from->di_flushiter);
 	}
 }
 
@@ -1029,10 +1031,14 @@
 /*
  * Read the disk inode attributes into the in-core inode structure.
  *
- * If we are initialising a new inode and we are not utilising the
- * XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new inode core
- * with a random generation number. If we are keeping inodes around, we need to
- * read the inode cluster to get the existing generation number off disk.
+ * For version 5 superblocks, if we are initialising a new inode and we are not
+ * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
+ * inode core with a random generation number. If we are keeping inodes around,
+ * we need to read the inode cluster to get the existing generation number off
+ * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
+ * format) then log recovery is dependent on the di_flushiter field being
+ * initialised from the current on-disk value and hence we must also read the
+ * inode off disk.
  */
 int
 xfs_iread(
@@ -1054,6 +1060,7 @@
 
 	/* shortcut IO on inode allocation if possible */
 	if ((iget_flags & XFS_IGET_CREATE) &&
+	    xfs_sb_version_hascrc(&mp->m_sb) &&
 	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 		/* initialise the on-disk inode core */
 		memset(&ip->i_d, 0, sizeof(ip->i_d));
@@ -2882,12 +2889,18 @@
 			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
 		goto corrupt_out;
 	}
+
 	/*
-	 * bump the flush iteration count, used to detect flushes which
-	 * postdate a log record during recovery. This is redundant as we now
-	 * log every change and hence this can't happen. Still, it doesn't hurt.
+	 * Inode item log recovery for v1/v2 inodes are dependent on the
+	 * di_flushiter count for correct sequencing. We bump the flush
+	 * iteration count so we can detect flushes which postdate a log record
+	 * during recovery. This is redundant as we now log every change and
+	 * hence this can't happen but we need to still do it to ensure
+	 * backwards compatibility with old kernels that predate logging all
+	 * inode changes.
 	 */
-	ip->i_d.di_flushiter++;
+	if (ip->i_d.di_version < 3)
+		ip->i_d.di_flushiter++;
 
 	/*
 	 * Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6fcc910a..7681b19 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2592,8 +2592,16 @@
 		goto error;
 	}
 
-	/* Skip replay when the on disk inode is newer than the log one */
-	if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+	/*
+	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+	 * are transactional and if ordering is necessary we can determine that
+	 * more accurately by the LSN field in the V3 inode core. Don't trust
+	 * the inode versions we might be changing them here - use the
+	 * superblock flag to determine whether we need to look at di_flushiter
+	 * to skip replay when the on disk inode is newer than the log one
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+	    dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
 		/*
 		 * Deal with the wrap case, DI_MAX_FLUSH is less
 		 * than smaller numbers
@@ -2608,6 +2616,7 @@
 			goto error;
 		}
 	}
+
 	/* Take the opportunity to reset the flush iteration count */
 	dicp->di_flushiter = 0;