writeback: bdi_writeback iteration must not skip dying ones

bdi_for_each_wb() is used in several places to wake up or issue
writeback work items to all wb's (bdi_writeback's) on a given bdi.
The iteration is performed by walking bdi->cgwb_tree; however, the
tree only indexes wb's which are currently active.
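
A simplified sketch of the tree-keyed walk being replaced (not the
literal bdi_for_each_wb() body, which this patch removes) makes the
problem visible: the cursor is a memcg css id, so only wb's still
indexed in the tree are ever visited.

  /* @bdi is the backing_dev_info being iterated; illustration only */
  struct bdi_writeback *wb;
  int next_id = 0;

  rcu_read_lock();
  while (radix_tree_gang_lookup(&bdi->cgwb_tree, (void **)&wb, next_id, 1)) {
          /* a wb already unhashed from cgwb_tree never shows up here */
          /* ... issue writeback work against @wb ... */
          next_id = wb->memcg_css->id + 1;
  }
  rcu_read_unlock();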

For example, when a memcg gets associated with a different blkcg, the
old wb is removed from the tree so that the new one can be indexed.
The old wb starts dying from then on but will linger till all its
inodes are drained.  As these dying wb's may still host dirty inodes,
writeback operations which affect all wb's must include them.
bdi_for_each_wb() skipping dying wb's caused sync(2) to miss those
wb's and fail to sync the inodes belonging to them.
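
For reference, the start-of-destruction step that makes a wb invisible
to the tree walk is roughly the following.  This is a sketch of what
the existing cgwb teardown path already does, shown only for
illustration, and assumes cgwb_lock is the lock guarding cgwb_tree:

  spin_lock_irq(&cgwb_lock);
  /* unhashed from the tree immediately ... */
  radix_tree_delete(&bdi->cgwb_tree, wb->memcg_css->id);
  spin_unlock_irq(&cgwb_lock);
  /* ... but @wb lingers, possibly with dirty inodes, until released */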

This patch adds an RCU protected @bdi->wb_list which lists all wb's
belonging to that bdi.  wb's are added on creation and removed on
release rather than on the start of destruction.  bdi_for_each_wb()
usages are replaced with list_for_each_entry[_continue]_rcu() iterations
over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed.
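
The hunks below only touch the fs/fs-writeback.c iteration sites; the
list hookup itself lives in the backing-dev code and is expected to
look roughly like the sketch below.  Placement and locking follow the
existing cgwb creation/release paths; the use of cgwb_lock here is an
assumption for illustration:

  /* on wb creation, once the new wb is live: */
  spin_lock_irq(&cgwb_lock);
  list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
  spin_unlock_irq(&cgwb_lock);

  /* on wb release (not at the start of destruction): */
  spin_lock_irq(&cgwb_lock);
  list_del_rcu(&wb->bdi_node);
  spin_unlock_irq(&cgwb_lock);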

v2: Updated as per Jan.  last_wb ref leak in bdi_split_work_to_wbs()
    fixed and unnecessary list head severing in cgwb_bdi_destroy()
    removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Artem Bityutskiy <dedekind1@gmail.com>
Fixes: ebe41ab0c79d ("writeback: implement bdi_for_each_wb()")
Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0da306..29e4599 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -778,19 +778,24 @@
 				  struct wb_writeback_work *base_work,
 				  bool skip_if_busy)
 {
-	int next_memcg_id = 0;
-	struct bdi_writeback *wb;
-	struct wb_iter iter;
+	struct bdi_writeback *last_wb = NULL;
+	struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
+						struct bdi_writeback, bdi_node);
 
 	might_sleep();
 restart:
 	rcu_read_lock();
-	bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
 		DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
 		struct wb_writeback_work fallback_work;
 		struct wb_writeback_work *work;
 		long nr_pages;
 
+		if (last_wb) {
+			wb_put(last_wb);
+			last_wb = NULL;
+		}
+
 		/* SYNC_ALL writes out I_DIRTY_TIME too */
 		if (!wb_has_dirty_io(wb) &&
 		    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@
 
 		wb_queue_work(wb, work);
 
-		next_memcg_id = wb->memcg_css->id + 1;
+		/*
+		 * Pin @wb so that it stays on @bdi->wb_list.  This allows
+		 * continuing iteration from @wb after dropping and
+		 * regrabbing rcu read lock.
+		 */
+		wb_get(wb);
+		last_wb = wb;
+
 		rcu_read_unlock();
 		wb_wait_for_completion(bdi, &fallback_work_done);
 		goto restart;
 	}
 	rcu_read_unlock();
+
+	if (last_wb)
+		wb_put(last_wb);
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
@@ -1857,12 +1872,11 @@
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		struct bdi_writeback *wb;
-		struct wb_iter iter;
 
 		if (!bdi_has_dirty_io(bdi))
 			continue;
 
-		bdi_for_each_wb(wb, bdi, &iter, 0)
+		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 			wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
 					   false, reason);
 	}
@@ -1894,9 +1908,8 @@
 	rcu_read_lock();
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		struct bdi_writeback *wb;
-		struct wb_iter iter;
 
-		bdi_for_each_wb(wb, bdi, &iter, 0)
+		list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
 			if (!list_empty(&wb->b_dirty_time))
 				wb_wakeup(wb);
 	}