RCU'd vfsmounts
* RCU-delayed freeing of vfsmounts
* vfsmount_lock replaced with a seqlock (mount_lock)
* sequence number from mount_lock is stored in nameidata->m_seq and
used when we exit RCU mode
* new vfsmount flag - MNT_SYNC_UMOUNT. Set by umount_tree() when its
caller knows that vfsmount will have no surviving references.
* synchronize_rcu() done between unlocking namespace_sem in namespace_unlock()
and doing pending mntput().
* new helper: legitimize_mnt(mnt, seq). Checks the mount_lock sequence
number against seq, then grabs reference to mnt. Then it rechecks mount_lock
again to close the race and either returns success or drops the reference it
has acquired. The subtle point is that in case of MNT_SYNC_UMOUNT we can
simply decrement the refcount and sod off - aforementioned synchronize_rcu()
makes sure that final mntput() won't come until we leave RCU mode. We need
that, since we don't want to end up with some lazy pathwalk racing with
umount() and stealing the final mntput() from it - caller of umount() may
expect it to return only once the fs is shut down and we don't want to break
that. In other cases (i.e. with MNT_SYNC_UMOUNT absent) we have to do
full-blown mntput() in case of mount_lock sequence number mismatch happening
just as we'd grabbed the reference, but in those cases we won't be stealing
the final mntput() from anything that would care.
* mntput_no_expire() doesn't lock anything on the fast path now. Incidentally,
SMP and UP cases are handled the same way - no ifdefs there.
* normal pathname resolution does *not* do any writes to mount_lock. It does,
of course, bump the refcounts of vfsmount and dentry in the very end, but that's
it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
diff --git a/fs/namei.c b/fs/namei.c
index 1f844fb..cb0ebae 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -484,14 +484,12 @@
static inline void lock_rcu_walk(void)
{
- br_read_lock(&vfsmount_lock);
rcu_read_lock();
}
static inline void unlock_rcu_walk(void)
{
rcu_read_unlock();
- br_read_unlock(&vfsmount_lock);
}
/**
@@ -512,26 +510,23 @@
BUG_ON(!(nd->flags & LOOKUP_RCU));
/*
- * Get a reference to the parent first: we're
- * going to make "path_put(nd->path)" valid in
- * non-RCU context for "terminate_walk()".
- *
- * If this doesn't work, return immediately with
- * RCU walking still active (and then we will do
- * the RCU walk cleanup in terminate_walk()).
+ * After legitimizing the bastards, terminate_walk()
+ * will do the right thing for non-RCU mode, and all our
+ * subsequent exit cases should rcu_read_unlock()
+ * before returning. Do vfsmount first; if dentry
+ * can't be legitimized, just set nd->path.dentry to NULL
+ * and rely on dput(NULL) being a no-op.
*/
- if (!lockref_get_not_dead(&parent->d_lockref))
+ if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
return -ECHILD;
-
- /*
- * After the mntget(), we terminate_walk() will do
- * the right thing for non-RCU mode, and all our
- * subsequent exit cases should unlock_rcu_walk()
- * before returning.
- */
- mntget(nd->path.mnt);
nd->flags &= ~LOOKUP_RCU;
+ if (!lockref_get_not_dead(&parent->d_lockref)) {
+ nd->path.dentry = NULL;
+ unlock_rcu_walk();
+ return -ECHILD;
+ }
+
/*
* For a negative lookup, the lookup sequence point is the parents
* sequence point, and it only needs to revalidate the parent dentry.
@@ -608,16 +603,21 @@
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
+ if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+ unlock_rcu_walk();
+ return -ECHILD;
+ }
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
unlock_rcu_walk();
+ mntput(nd->path.mnt);
return -ECHILD;
}
if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
unlock_rcu_walk();
dput(dentry);
+ mntput(nd->path.mnt);
return -ECHILD;
}
- mntget(nd->path.mnt);
unlock_rcu_walk();
}
@@ -909,15 +909,15 @@
struct mount *parent;
struct dentry *mountpoint;
- br_read_lock(&vfsmount_lock);
+ read_seqlock_excl(&mount_lock);
parent = mnt->mnt_parent;
if (parent == mnt) {
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
return 0;
}
mntget(&parent->mnt);
mountpoint = dget(mnt->mnt_mountpoint);
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
@@ -1048,8 +1048,8 @@
/* Something is mounted on this dentry in another
* namespace and/or whatever was mounted there in this
- * namespace got unmounted before we managed to get the
- * vfsmount_lock */
+ * namespace got unmounted before lookup_mnt() could
+ * get it */
}
/* Handle an automount point */
@@ -1864,6 +1864,7 @@
if (flags & LOOKUP_RCU) {
lock_rcu_walk();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+ nd->m_seq = read_seqbegin(&mount_lock);
} else {
path_get(&nd->path);
}
@@ -1872,6 +1873,7 @@
nd->root.mnt = NULL;
+ nd->m_seq = read_seqbegin(&mount_lock);
if (*name=='/') {
if (flags & LOOKUP_RCU) {
lock_rcu_walk();