xfs: when replaying bmap operations, don't let unlinked inodes get reaped

Log recovery will iget an inode to replay BUI items and iput the inode
when it's done.  Unfortunately, if the inode was unlinked, the iput
will see that i_nlink == 0 and decide to truncate & free the inode,
which prevents us from replaying subsequent BUIs.  We can't skip the
BUIs because we have to replay all the redo items to ensure that
atomic operations complete.

Since unlinked inode recovery will reap the inode anyway, we can
safely introduce a new inode flag to indicate that an inode is in this
'unlinked recovery' state and should not be auto-reaped in the
drop_inode path.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 02e8f93..9bf57c7 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -456,6 +456,8 @@
 	if (error)
 		goto err_inode;
 
+	if (VFS_I(ip)->i_nlink == 0)
+		xfs_iflags_set(ip, XFS_IRECOVERY);
 	xfs_defer_init(&dfops, &firstfsb);
 
 	/* Process deferred bmap item. */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e08eaea..edb453d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1850,6 +1850,7 @@
 	}
 
 	mp = ip->i_mount;
+	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a8658e6..4be3203 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -222,6 +222,12 @@
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
 #define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
 #define XFS_IEOFBLOCKS		(1 << 10)/* has the preallocblocks tag set */
+/*
+ * If this unlinked inode is in the middle of recovery, don't let drop_inode
+ * truncate and free the inode.  This can happen if we iget the inode during
+ * log recovery to replay a bmap operation on the inode.
+ */
+#define XFS_IRECOVERY		(1 << 11)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9697e94..9b3d7c7 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4969,6 +4969,7 @@
 	if (error)
 		goto fail_iput;
 
+	xfs_iflags_clear(ip, XFS_IRECOVERY);
 	ASSERT(VFS_I(ip)->i_nlink == 0);
 	ASSERT(VFS_I(ip)->i_mode != 0);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3f64615..a78a116 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -925,6 +925,15 @@
 	}
 
 	/*
+	 * During the second phase of log recovery, we need iget and
+	 * iput to behave like they do for an active filesystem.
+	 * xfs_fs_drop_inode needs to be able to prevent the deletion
+	 * of inodes before we're done replaying log items on those
+	 * inodes.
+	 */
+	mp->m_super->s_flags |= MS_ACTIVE;
+
+	/*
 	 * Finish recovering the file system.  This part needed to be delayed
 	 * until after the root and real-time bitmap inodes were consistently
 	 * read in.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 204b794..35c04a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1008,6 +1008,16 @@
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
+	/*
+	 * If this unlinked inode is in the middle of recovery, don't
+	 * drop the inode just yet; log recovery will take care of
+	 * that.  See the comment for this inode flag.
+	 */
+	if (ip->i_flags & XFS_IRECOVERY) {
+		ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+		return 0;
+	}
+
 	return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
 }