GFS2: Fix inode deallocation race
This area of the code has always been a bit delicate due to the
subtleties of lock ordering. The problem is that for "normal"
alloc/dealloc, we always grab the inode locks first and the rgrp lock
later.
In order to ensure no races in looking up the unlinked, but still
allocated inodes, we need to hold the rgrp lock when we do the lookup,
which means that we can't take the inode glock.
The solution is to borrow the technique already used by NFS to solve
what is essentially the same problem (given an inode number, look up
the inode carefully, checking that it really is in the expected
state).
We cannot do that directly from the allocation code (lock ordering
again) so we give the job to the pre-existing delete workqueue and
carry on with the allocation as normal.
If we find there is no space, we do a journal flush (required anyway
if space from a deallocation is to be released) which should block
against the pending deallocations, so we should always get the space
back.
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bef3ab6..33c8407 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -963,17 +963,18 @@
* The inode, if one has been found, in inode.
*/
-static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked,
- u64 skip)
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
{
u32 goal = 0, block;
u64 no_addr;
struct gfs2_sbd *sdp = rgd->rd_sbd;
unsigned int n;
+ struct gfs2_glock *gl;
+ struct gfs2_inode *ip;
+ int error;
+ int found = 0;
- for(;;) {
- if (goal >= rgd->rd_data)
- break;
+ while (goal < rgd->rd_data) {
down_write(&sdp->sd_log_flush_lock);
n = 1;
block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
@@ -990,11 +991,32 @@
if (no_addr == skip)
continue;
*last_unlinked = no_addr;
- return no_addr;
+
+ error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl);
+ if (error)
+ continue;
+
+ /* If the inode is already in cache, we can ignore it here
+ * because the existing inode disposal code will deal with
+ * it when all refs have gone away. Accessing gl_object like
+ * this is not safe in general. Here it is ok because we do
+ * not dereference the pointer, and we only need an approx
+ * answer to whether it is NULL or not.
+ */
+ ip = gl->gl_object;
+
+ if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
+ gfs2_glock_put(gl);
+ else
+ found++;
+
+ /* Limit reclaim to sensible number of tasks */
+ if (found > 2*NR_CPUS)
+ return;
}
rgd->rd_flags &= ~GFS2_RDF_CHECK;
- return 0;
+ return;
}
/**
@@ -1075,11 +1097,9 @@
* Try to acquire rgrp in way which avoids contending with others.
*
* Returns: errno
- * unlinked: the block address of an unlinked block to be reclaimed
*/
-static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked,
- u64 *last_unlinked)
+static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd, *begin = NULL;
@@ -1089,7 +1109,6 @@
int loops = 0;
int error, rg_locked;
- *unlinked = 0;
rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
@@ -1106,17 +1125,10 @@
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
- /* If the rg came in already locked, there's no
- way we can recover from a failed try_rgrp_unlink
- because that would require an iput which can only
- happen after the rgrp is unlocked. */
- if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
- *unlinked = try_rgrp_unlink(rgd, last_unlinked,
- ip->i_no_addr);
+ if (rgd->rd_flags & GFS2_RDF_CHECK)
+ try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (*unlinked)
- return -EAGAIN;
/* fall through */
case GLR_TRYFAILED:
rgd = recent_rgrp_next(rgd);
@@ -1145,13 +1157,10 @@
case 0:
if (try_rgrp_fit(rgd, al))
goto out;
- if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK)
- *unlinked = try_rgrp_unlink(rgd, last_unlinked,
- ip->i_no_addr);
+ if (rgd->rd_flags & GFS2_RDF_CHECK)
+ try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr);
if (!rg_locked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (*unlinked)
- return -EAGAIN;
break;
case GLR_TRYFAILED:
@@ -1204,12 +1213,12 @@
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
int error = 0;
- u64 last_unlinked = NO_BLOCK, unlinked;
+ u64 last_unlinked = NO_BLOCK;
+ int tries = 0;
if (gfs2_assert_warn(sdp, al->al_requested))
return -EINVAL;
-try_again:
if (hold_rindex) {
/* We need to hold the rindex unless the inode we're using is
the rindex itself, in which case it's already held. */
@@ -1218,31 +1227,23 @@
else if (!sdp->sd_rgrps) /* We may not have the rindex read
in, so: */
error = gfs2_ri_update_special(ip);
+ if (error)
+ return error;
}
- if (error)
- return error;
+ do {
+ error = get_local_rgrp(ip, &last_unlinked);
+ /* If there is no space, flushing the log may release some */
+ if (error)
+ gfs2_log_flush(sdp, NULL);
+ } while (error && tries++ < 3);
- /* Find an rgrp suitable for allocation. If it encounters any unlinked
- dinodes along the way, error will equal -EAGAIN and unlinked will
- contains it block address. We then need to look up that inode and
- try to free it, and try the allocation again. */
- error = get_local_rgrp(ip, &unlinked, &last_unlinked);
if (error) {
if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
gfs2_glock_dq_uninit(&al->al_ri_gh);
- if (error != -EAGAIN)
- return error;
-
- gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
- /* regardless of whether or not gfs2_process_unlinked_inode
- was successful, we don't want to repeat it again. */
- last_unlinked = unlinked;
- gfs2_log_flush(sdp, NULL);
- error = 0;
-
- goto try_again;
+ return error;
}
+
/* no error, so we have the rgrp set in the inode's allocation. */
al->al_file = file;
al->al_line = line;