GFS2: Resolve inode eviction and ail list interaction bug This patch contains a few misc fixes which resolve a recently reported issue. This patch has been a real team effort and has received a lot of testing. The first issue is that the ail lock needs to be held over a few more operations. The lock thats added into gfs2_releasepage() may possibly be a candidate for replacing with RCU at some future point, but at this stage we've gone for the obvious fix. The second issue is that gfs2_write_inode() can end up calling a glock recursively when called from gfs2_evict_inode() via the syncing code, so it needs a guard added. The third issue is that we either need to not truncate the metadata pages of inodes which have zero link count, but which we cannot deallocate due to them still being in use by other nodes, or we need to ensure that those pages have all made it through the journal and ail lists first. This patch takes the former approach, but the latter has also been tested and there is nothing to choose between them performance-wise. So again, we could revise that decision in the future. Also, the inode eviction process is now better documented. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> Tested-by: Bob Peterson <rpeterso@redhat.com> Tested-by: Abhijith Das <adas@redhat.com> Reported-by: Barry J. Marson <bmarson@redhat.com> Reported-by: David Teigland <teigland@redhat.com>

commit: 380f7c65a7eb3288e4b6812acf3474a1de230707 [log] [tgz]
author: Steven Whitehouse <swhiteho@redhat.com> Thu Jul 14 08:59:44 2011 +0100
committer: Steven Whitehouse <swhiteho@redhat.com> Thu Jul 14 08:59:44 2011 +0100
tree: 7abe4b4ce390afc6b4d2bc7cae0ec298fe2efa3c
parent: 3942ae5319640ced5844b75f44884e4bcb8a2f16 [diff]
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 802ac5e..f9fbbe9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c

@@ -1069,6 +1069,7 @@
 		return 0;
 
 	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 	head = bh = page_buffers(page);
 	do {
 		if (atomic_read(&bh->b_count))
@@ -1080,6 +1081,7 @@
 			goto not_possible;
 		bh = bh->b_this_page;
 	} while(bh != head);
+	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 
 	head = bh = page_buffers(page);
@@ -1112,6 +1114,7 @@
 	WARN_ON(buffer_dirty(bh));
 	WARN_ON(buffer_pinned(bh));
 cannot_release:
+	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 	return 0;
 }

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 712b722..2cca2931 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c

@@ -47,10 +47,10 @@
 				bd_ail_gl_list);
 		bh = bd->bd_bh;
 		gfs2_remove_from_ail(bd);
-		spin_unlock(&sdp->sd_ail_lock);
-
 		bd->bd_bh = NULL;
 		bh->b_private = NULL;
+		spin_unlock(&sdp->sd_ail_lock);
+
 		bd->bd_blkno = bh->b_blocknr;
 		gfs2_log_lock(sdp);
 		gfs2_assert_withdraw(sdp, !buffer_busy(bh));

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 903115f..85c6292 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c

@@ -903,6 +903,7 @@
 		if (gfs2_ail1_empty(sdp))
 			break;
 	}
+	gfs2_log_flush(sdp, NULL);
 }
 
 static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ed540e7..fb0edf7 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c

@@ -757,13 +757,17 @@
 	struct timespec atime;
 	struct gfs2_dinode *di;
 	int ret = -EAGAIN;
+	int unlock_required = 0;
 
 	/* Skip timestamp update, if this is from a memalloc */
 	if (current->flags & PF_MEMALLOC)
 		goto do_flush;
-	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (ret)
-		goto do_flush;
+	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+		if (ret)
+			goto do_flush;
+		unlock_required = 1;
+	}
 	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
 	if (ret)
 		goto do_unlock;
@@ -780,7 +784,8 @@
 	}
 	gfs2_trans_end(sdp);
 do_unlock:
-	gfs2_glock_dq_uninit(&gh);
+	if (unlock_required)
+		gfs2_glock_dq_uninit(&gh);
 do_flush:
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
@@ -1427,7 +1432,20 @@
 	return error;
 }
 
-/*
+/**
+ * gfs2_evict_inode - Remove an inode from cache
+ * @inode: The inode to evict
+ *
+ * There are three cases to consider:
+ * 1. i_nlink == 0, we are final opener (and must deallocate)
+ * 2. i_nlink == 0, we are not the final opener (and cannot deallocate)
+ * 3. i_nlink > 0
+ *
+ * If the fs is read only, then we have to treat all cases as per #3
+ * since we are unable to do any deallocation. The inode will be
+ * deallocated by the next read/write node to attempt an allocation
+ * in the same resource group
+ *
  * We have to (at the moment) hold the inodes main lock to cover
  * the gap between unlocking the shared lock on the iopen lock and
  * taking the exclusive lock. I'd rather do a shared -> exclusive
@@ -1470,6 +1488,8 @@
 	if (error)
 		goto out_truncate;
 
+	/* Case 1 starts here */
+
 	if (S_ISDIR(inode->i_mode) &&
 	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
 		error = gfs2_dir_exhash_dealloc(ip);
@@ -1493,13 +1513,16 @@
 	goto out_unlock;
 
 out_truncate:
+	/* Case 2 starts here */
 	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 	if (error)
 		goto out_unlock;
-	gfs2_final_release_pages(ip);
+	/* Needs to be done before glock release & also in a transaction */
+	truncate_inode_pages(&inode->i_data, 0);
 	gfs2_trans_end(sdp);
 
 out_unlock:
+	/* Error path for case 1 */
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
@@ -1507,6 +1530,7 @@
 	if (error && error != GLR_TRYFAILED && error != -EROFS)
 		fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
 out:
+	/* Case 3 starts here */
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
commit	380f7c65a7eb3288e4b6812acf3474a1de230707	[log] [tgz]
author	Steven Whitehouse <swhiteho@redhat.com>	Thu Jul 14 08:59:44 2011 +0100
committer	Steven Whitehouse <swhiteho@redhat.com>	Thu Jul 14 08:59:44 2011 +0100
tree	7abe4b4ce390afc6b4d2bc7cae0ec298fe2efa3c
parent	3942ae5319640ced5844b75f44884e4bcb8a2f16 [diff]