ceph: flush dirty caps via the cap_dirty list

Previously we were flushing dirty caps by passing an extra flag
when traversing the delayed caps list.  Besides being a bit ugly,
that can also miss caps that are dirty but didn't result in a
cap requeue: notably, mark_caps_dirty().

Separate the flushing into a separate helper, and traverse the
cap_dirty list.

This also brings i_dirty_item in line with i_dirty_caps: we are
on the list IFF caps != 0.  We carry an inode ref IFF
dirty_caps|flushing_caps != 0.

Lose the unused return value from __ceph_mark_caps_dirty().

Signed-off-by: Sage Weil <sage@newdream.net>
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 111439d..40b8d34 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -997,7 +997,7 @@
 		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
 			dout(" release msg %p full\n", msg);
 			list_move_tail(&msg->list_head,
-				      &session->s_cap_releases_done);
+				       &session->s_cap_releases_done);
 		} else {
 			dout(" release msg %p at %d/%d (%d)\n", msg,
 			     (int)le32_to_cpu(head->num),
@@ -1292,14 +1292,20 @@
 	     ceph_cap_string(ci->i_flushing_caps | flushing));
 	ci->i_flushing_caps |= flushing;
 	ci->i_dirty_caps = 0;
+	dout(" inode %p now !dirty\n", inode);
 
 	spin_lock(&mdsc->cap_dirty_lock);
+	list_del_init(&ci->i_dirty_item);
+
+	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
 	if (list_empty(&ci->i_flushing_item)) {
-		list_del_init(&ci->i_dirty_item);
 		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 		mdsc->num_cap_flushing++;
-		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
-		dout(" inode %p now flushing seq %lld\n", &ci->vfs_inode,
+		dout(" inode %p now flushing seq %lld\n", inode,
+		     ci->i_cap_flush_seq);
+	} else {
+		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		dout(" inode %p now flushing (more) seq %lld\n", inode,
 		     ci->i_cap_flush_seq);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
@@ -1555,32 +1561,33 @@
  * Mark caps dirty.  If inode is newly dirty, add to the global dirty
  * list.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
-	int was = __ceph_caps_dirty(ci);
+	int was_dirty = ci->i_dirty_caps;
 	int dirty = 0;
 
 	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
 	     ceph_cap_string(mask), ceph_cap_string(ci->i_dirty_caps),
 	     ceph_cap_string(ci->i_dirty_caps | mask));
 	ci->i_dirty_caps |= mask;
-	if (!was) {
+	if (!was_dirty) {
 		dout(" inode %p now dirty\n", &ci->vfs_inode);
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
 		spin_unlock(&mdsc->cap_dirty_lock);
-		igrab(inode);
-		dirty |= I_DIRTY_SYNC;
+		if (ci->i_flushing_caps == 0) {
+			igrab(inode);
+			dirty |= I_DIRTY_SYNC;
+		}
 	}
-	if ((was & CEPH_CAP_FILE_BUFFER) &&
+	if (((was_dirty | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
 	    (mask & CEPH_CAP_FILE_BUFFER))
 		dirty |= I_DIRTY_DATASYNC;
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
 	__cap_delay_requeue(mdsc, ci);
-	return was;
 }
 
 /*
@@ -2327,7 +2334,7 @@
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
 	u64 flush_tid = le64_to_cpu(m->client_tid);
-	int old_dirty = 0, new_dirty = 0;
+	int drop = 0;
 	int i;
 
 	for (i = 0; i < CEPH_CAP_BITS; i++)
@@ -2344,9 +2351,7 @@
 	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
 		goto out;
 
-	old_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
 	ci->i_flushing_caps &= ~cleaned;
-	new_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
 
 	spin_lock(&mdsc->cap_dirty_lock);
 	if (ci->i_flushing_caps == 0) {
@@ -2360,17 +2365,19 @@
 		mdsc->num_cap_flushing--;
 		wake_up(&mdsc->cap_flushing_wq);
 		dout(" inode %p now !flushing\n", inode);
-	}
-	if (old_dirty && !new_dirty) {
-		dout(" inode %p now clean\n", inode);
-		list_del_init(&ci->i_dirty_item);
+
+		if (ci->i_dirty_caps == 0) {
+			dout(" inode %p now clean\n", inode);
+			BUG_ON(!list_empty(&ci->i_dirty_item));
+			drop = 1;
+		}
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
 	wake_up(&ci->i_cap_wq);
 
 out:
 	spin_unlock(&inode->i_lock);
-	if (old_dirty && !new_dirty)
+	if (drop)
 		iput(inode);
 }
 
@@ -2676,14 +2683,11 @@
 /*
  * Delayed work handler to process end of delayed cap release LRU list.
  */
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc, int flushdirty)
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
 	struct ceph_inode_info *ci;
 	int flags = CHECK_CAPS_NODELAY;
 
-	if (flushdirty)
-		flags |= CHECK_CAPS_FLUSH;
-
 	dout("check_delayed_caps\n");
 	while (1) {
 		spin_lock(&mdsc->cap_delay_lock);
@@ -2704,6 +2708,32 @@
 }
 
 /*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	struct inode *inode;
+
+	dout("flush_dirty_caps\n");
+	spin_lock(&mdsc->cap_dirty_lock);
+	while (!list_empty(&mdsc->cap_dirty)) {
+		ci = list_first_entry(&mdsc->cap_dirty,
+				      struct ceph_inode_info,
+				      i_dirty_item);
+		inode = igrab(&ci->vfs_inode);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (inode) {
+			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+					NULL);
+			iput(inode);
+		}
+		spin_lock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
  * Drop open file reference.  If we were the last open file,
  * we may need to release capabilities to the MDS (or schedule
  * their delayed release).
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2b19da3..12d66c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2504,7 +2504,7 @@
 	int renew_caps;
 
 	dout("mdsc delayed_work\n");
-	ceph_check_delayed_caps(mdsc, 0);
+	ceph_check_delayed_caps(mdsc);
 
 	mutex_lock(&mdsc->mutex);
 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -2627,7 +2627,7 @@
 	mdsc->stopping = 1;
 
 	drop_leases(mdsc);
-	ceph_check_delayed_caps(mdsc, 1);
+	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
 }
 
@@ -2677,7 +2677,7 @@
 	mutex_unlock(&mdsc->mutex);
 	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
 
-	ceph_check_delayed_caps(mdsc, 1);
+	ceph_flush_dirty_caps(mdsc);
 
 	wait_unsafe_requests(mdsc, want_tid);
 	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cfd39ef..0bbf58a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -524,7 +524,7 @@
 {
 	return ci->i_dirty_caps | ci->i_flushing_caps;
 }
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
 
 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
 extern int __ceph_caps_used(struct ceph_inode_info *ci);
@@ -814,8 +814,8 @@
 			       struct ceph_mds_session **psession);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc,
-				    int flushdirty);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
 
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
 				     int mds, int drop, int unless, int force);