[XFS] Start writeout earlier (on last close) in the case where we have a
truncate down followed by delayed allocation (buffered writes) - the worst
case scenario for the notorious NULL files problem. This significantly
reduces the window in which we are exposed to that problem.
SGI-PV: 917976
SGI-Modid: xfs-linux-melb:xfs-kern:26100a
Signed-off-by: Nathan Scott <nathans@sgi.com>
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 5835e69..c0a9043 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1157,6 +1157,18 @@
return error;
}
+STATIC int
+xfs_vm_writepages(
+ struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct vnode *vp = vn_from_inode(mapping->host);
+
+ if (VN_TRUNC(vp))
+ VUNTRUNCATE(vp);
+ return generic_writepages(mapping, wbc);
+}
+
/*
* Called to move a page into cleanable state - and from there
* to be released. Possibly the page is already clean. We always
@@ -1451,6 +1463,7 @@
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
.writepage = xfs_vm_writepage,
+ .writepages = xfs_vm_writepages,
.sync_page = block_sync_page,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 7c9f759..97615cc 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -324,6 +324,17 @@
}
STATIC int
+xfs_file_close(
+ struct file *filp)
+{
+ vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode);
+ int error;
+
+ VOP_CLOSE(vp, 0, file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL, error);
+ return -error;
+}
+
+STATIC int
xfs_file_release(
struct inode *inode,
struct file *filp)
@@ -349,6 +360,8 @@
if (datasync)
flags |= FSYNC_DATA;
+ if (VN_TRUNC(vp))
+ VUNTRUNCATE(vp);
VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
return -error;
}
@@ -578,6 +591,7 @@
#endif
.mmap = xfs_file_mmap,
.open = xfs_file_open,
+ .flush = xfs_file_close,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
#ifdef HAVE_FOP_OPEN_EXEC
@@ -602,6 +616,7 @@
#endif
.mmap = xfs_file_mmap,
.open = xfs_file_open,
+ .flush = xfs_file_close,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
};
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 575f2a7..f0c56da 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -15,40 +15,12 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-
#include "xfs.h"
-/*
- * Stub for no-op vnode operations that return error status.
- */
-int
-fs_noerr(void)
-{
- return 0;
-}
+int fs_noerr(void) { return 0; }
+int fs_nosys(void) { return ENOSYS; }
+void fs_noval(void) { return; }
-/*
- * Operation unsupported under this file system.
- */
-int
-fs_nosys(void)
-{
- return ENOSYS;
-}
-
-/*
- * Stub for inactive, strategy, and read/write lock/unlock. Does nothing.
- */
-/* ARGSUSED */
-void
-fs_noval(void)
-{
-}
-
-/*
- * vnode pcache layer for vnode_tosspages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
void
fs_tosspages(
bhv_desc_t *bdp,
@@ -63,11 +35,6 @@
truncate_inode_pages(ip->i_mapping, first);
}
-
-/*
- * vnode pcache layer for vnode_flushinval_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
void
fs_flushinval_pages(
bhv_desc_t *bdp,
@@ -79,16 +46,13 @@
struct inode *ip = vn_to_inode(vp);
if (VN_CACHED(vp)) {
+ if (VN_TRUNC(vp))
+ VUNTRUNCATE(vp);
filemap_write_and_wait(ip->i_mapping);
-
truncate_inode_pages(ip->i_mapping, first);
}
}
-/*
- * vnode pcache layer for vnode_flush_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
int
fs_flush_pages(
bhv_desc_t *bdp,
@@ -100,12 +64,13 @@
vnode_t *vp = BHV_TO_VNODE(bdp);
struct inode *ip = vn_to_inode(vp);
- if (VN_CACHED(vp)) {
+ if (VN_DIRTY(vp)) {
+ if (VN_TRUNC(vp))
+ VUNTRUNCATE(vp);
filemap_fdatawrite(ip->i_mapping);
if (flags & XFS_B_ASYNC)
return 0;
filemap_fdatawait(ip->i_mapping);
}
-
return 0;
}
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index a64b7db..569a4e7 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -56,12 +56,18 @@
typedef struct dentry vname_t;
typedef bhv_head_t vn_bhv_head_t;
+typedef enum vflags {
+ VMODIFIED = 0x08, /* XFS inode state possibly differs */
+ /* to the Linux inode state. */
+ VTRUNCATED = 0x40, /* truncated down so flush-on-close */
+} vflags_t;
+
/*
* MP locking protocols:
* v_flag, v_vfsp VN_LOCK/VN_UNLOCK
*/
typedef struct vnode {
- __u32 v_flag; /* vnode flags (see below) */
+ vflags_t v_flag; /* vnode flags (see above) */
struct vfs *v_vfsp; /* ptr to containing VFS */
vnumber_t v_number; /* in-core vnode number */
vn_bhv_head_t v_bh; /* behavior head */
@@ -126,12 +132,6 @@
}
/*
- * Vnode flags.
- */
-#define VMODIFIED 0x8 /* XFS inode state possibly differs */
- /* to the Linux inode state. */
-
-/*
* Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
*/
typedef enum vrwlock {
@@ -162,8 +162,10 @@
VCHANGE_FLAGS_IOEXCL_COUNT = 4
} vchange_t;
+typedef enum { L_FALSE, L_TRUE } lastclose_t;
typedef int (*vop_open_t)(bhv_desc_t *, struct cred *);
+typedef int (*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *);
typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
const struct iovec *, unsigned int,
loff_t *, int, struct cred *);
@@ -234,6 +236,7 @@
typedef struct vnodeops {
bhv_position_t vn_position; /* position within behavior chain */
vop_open_t vop_open;
+ vop_close_t vop_close;
vop_read_t vop_read;
vop_write_t vop_write;
vop_sendfile_t vop_sendfile;
@@ -278,6 +281,10 @@
*/
#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op)
+#define VOP_OPEN(vp, cr, rv) \
+ rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
+#define VOP_CLOSE(vp, f, last, cr, rv) \
+ rv = _VOP_(vop_close, vp)((vp)->v_fbhv, f, last, cr)
#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \
rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \
@@ -290,8 +297,6 @@
rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \
rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_OPEN(vp, cr, rv) \
- rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
#define VOP_GETATTR(vp, vap, f, cr, rv) \
rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
#define VOP_SETATTR(vp, vap, f, cr, rv) \
@@ -556,8 +561,6 @@
*/
#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock)
#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s)
-#define VN_FLAGSET(vp,b) vn_flagset(vp,b)
-#define VN_FLAGCLR(vp,b) vn_flagclr(vp,b)
static __inline__ void vn_flagset(struct vnode *vp, uint flag)
{
@@ -566,13 +569,22 @@
spin_unlock(&vp->v_lock);
}
-static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+static __inline__ uint vn_flagclr(struct vnode *vp, uint flag)
{
+ uint cleared;
+
spin_lock(&vp->v_lock);
+ cleared = (vp->v_flag & flag);
vp->v_flag &= ~flag;
spin_unlock(&vp->v_lock);
+ return cleared;
}
+#define VMODIFY(vp) vn_flagset(vp, VMODIFIED)
+#define VUNMODIFY(vp) vn_flagclr(vp, VMODIFIED)
+#define VTRUNCATE(vp) vn_flagset(vp, VTRUNCATED)
+#define VUNTRUNCATE(vp) vn_flagclr(vp, VTRUNCATED)
+
/*
* Dealing with bad inodes
*/
@@ -612,8 +624,7 @@
#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages)
#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \
PAGECACHE_TAG_DIRTY)
-#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
-#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED)
+#define VN_TRUNC(vp) ((vp)->v_flag & VTRUNCATED)
/*
* Flags to VOP_SETATTR/VOP_GETATTR.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index cb36a56..35906ba 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -16,8 +16,6 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <linux/capability.h>
-
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
@@ -58,32 +56,14 @@
#include "xfs_log_priv.h"
#include "xfs_mac.h"
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-
-/*
- * For xfs, we check that the file isn't too big to be opened by this kernel.
- * No other open action is required for regular files. Devices are handled
- * through the specfs file system, pipes through fifofs. Device and
- * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
- * when a new vnode is first looked up or created.
- */
STATIC int
xfs_open(
bhv_desc_t *bdp,
cred_t *credp)
{
int mode;
- vnode_t *vp;
- xfs_inode_t *ip;
-
- vp = BHV_TO_VNODE(bdp);
- ip = XFS_BHVTOI(bdp);
+ vnode_t *vp = BHV_TO_VNODE(bdp);
+ xfs_inode_t *ip = XFS_BHVTOI(bdp);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return XFS_ERROR(EIO);
@@ -101,6 +81,36 @@
return 0;
}
+STATIC int
+xfs_close(
+ bhv_desc_t *bdp,
+ int flags,
+ lastclose_t lastclose,
+ cred_t *credp)
+{
+ vnode_t *vp = BHV_TO_VNODE(bdp);
+ xfs_inode_t *ip = XFS_BHVTOI(bdp);
+ int error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return XFS_ERROR(EIO);
+
+ if (lastclose != L_TRUE || !VN_ISREG(vp))
+ return 0;
+
+ /*
+ * If we previously truncated this file and removed old data in
+ * the process, we want to initiate "early" writeout on the last
+ * close. This is an attempt to combat the notorious NULL files
+ * problem which is particularly noticeable from a truncate down,
+ * buffered (re-)write (delalloc), followed by a crash. What we
+ * are effectively doing here is significantly reducing the time
+ * window where we'd otherwise be exposed to that problem.
+ */
+ if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+ VOP_FLUSH_PAGES(vp, 0, -1, XFS_B_ASYNC, FI_NONE, error);
+ return error;
+}
/*
* xfs_getattr
@@ -665,9 +675,17 @@
((ip->i_d.di_nlink != 0 ||
!(mp->m_flags & XFS_MOUNT_WSYNC))
? 1 : 0));
- if (code) {
+ if (code)
goto abort_return;
- }
+ /*
+ * Truncated "down", so we're removing references
+ * to old data here - if we now delay flushing for
+ * a long time, we expose ourselves unduly to the
+ * notorious NULL files problem. So, we mark this
+ * vnode and flush it when the file is closed, and
+ * do not wait the usual (long) time for writeout.
+ */
+ VTRUNCATE(vp);
}
/*
* Have to do this even if the file's size doesn't change.
@@ -937,6 +955,13 @@
/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
+
+/*
* xfs_readlink
*
*/
@@ -1470,9 +1495,6 @@
return 0;
}
-/*
- *
- */
STATIC int
xfs_inactive_attrs(
xfs_inode_t *ip,
@@ -1531,10 +1553,10 @@
vp = BHV_TO_VNODE(bdp);
ip = XFS_BHVTOI(bdp);
+ mp = ip->i_mount;
- if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
+ if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
return 0;
- }
/* If this is a read-only mount, don't do this (would generate I/O) */
if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
@@ -1546,8 +1568,6 @@
return 0;
#endif
- mp = ip->i_mount;
-
if (ip->i_d.di_nlink != 0) {
if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -3745,7 +3765,6 @@
return error;
}
-
int
xfs_set_dmattrs (
bhv_desc_t *bdp,
@@ -3786,10 +3805,6 @@
return error;
}
-
-/*
- * xfs_reclaim
- */
STATIC int
xfs_reclaim(
bhv_desc_t *bdp)
@@ -4645,6 +4660,7 @@
vnodeops_t xfs_vnodeops = {
BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
.vop_open = xfs_open,
+ .vop_close = xfs_close,
.vop_read = xfs_read,
#ifdef HAVE_SENDFILE
.vop_sendfile = xfs_sendfile,